| Title: | Integrated Text Mining Solution |
|---|---|
| Description: | An integrated solution to perform a series of text mining tasks such as importing and cleaning a corpus, and analyses like terms and documents counts, lexical summary, terms co-occurrences and documents similarity measures, graphs of terms, correspondence analysis and hierarchical clustering. Corpora can be imported from spreadsheet-like files, directories of raw text files, as well as from 'Dow Jones Factiva', 'LexisNexis', 'Europresse' and 'Alceste' files. |
| Authors: | Milan Bouchet-Valat [aut, cre], Gilles Bastin [aut], Antoine Chollet [aut] |
| Maintainer: | Milan Bouchet-Valat <[email protected]> |
| License: | GPL (>= 2) |
| Version: | 0.1.3 |
| Built: | 2026-05-23 06:34:04 UTC |
| Source: | https://github.com/nalimilan/r.temis |
Add a meta-data variable to a corpus indicating the cluster to which each document belongs.
add_clusters(corpus, clust)
corpus |
A |
clust |
A |
A Corpus object with meta(corpus, "cluster") indicating the cluster
of each document.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
res <- corpus_ca(corpus, dtm, ncp=2, sparsity=0.98)
clust <- corpus_clustering(res, 3)
corpus <- add_clusters(corpus, clust)
meta(corpus)
Compute document-term matrix from a corpus.
build_dtm( corpus, sparsity = 1, dictionary = NULL, remove_stopwords = FALSE, tolower = TRUE, remove_punctuation = TRUE, remove_numbers = TRUE, min_length = 2 )
corpus |
A |
sparsity |
Value between 0 and 1 indicating the proportion of documents
with no occurrences of a term above which that term should be dropped. By default
all terms are kept (sparsity = 1). |
dictionary |
A vector of terms to which the matrix should be restricted.
By default, all words with more than |
remove_stopwords |
Whether to remove stopwords appearing in a language-specific list
(see |
tolower |
Whether to convert all text to lower case. |
remove_punctuation |
Whether to remove all punctuation from text before tokenizing terms. |
remove_numbers |
Whether to remove all numbers from text before tokenizing terms. |
min_length |
The minimal number of characters for a word to be retained. |
A DocumentTermMatrix object.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
build_dtm(corpus)
Print documents which are the most characteristic of each level of a variable, i.e. those with the lowest Chi-squared distance to the average vocabulary of documents belonging to that level.
characteristic_docs(corpus, dtm, variable, ndocs = 10, nterms = 25, p = 0.1)
corpus |
A |
dtm |
A |
variable |
A vector of values giving the groups for which most frequent terms should be reported. |
ndocs |
The number of (most characteristic) documents to print. |
nterms |
The number of terms to highlight in documents. |
p |
The maximum p-value up to which specific terms should be highlighted. |
Occurrences of the nterms most specific terms for each level are highlighted.
If stemming or other transformations have been applied to original words
using combine_terms, all original words which have been transformed
to the specified terms are highlighted.
A list with one Corpus object for each level (invisibly).
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
characteristic_docs(corpus, dtm, meta(corpus)$Date)
# Also works when terms have been combined
dict <- dictionary(dtm)
dtm2 <- combine_terms(dtm, dict)
characteristic_docs(corpus, dtm2, meta(corpus)$Date)
Aggregate terms in a document-term matrix according to groupings specified by a dictionary.
combine_terms(dtm, dict)
dtm |
A |
dict |
A |
If several terms use the same transformation, they will be aggregated together.
Terms missing from dict will be dropped.
An aggregated DocumentTermMatrix object.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
dict <- dictionary(dtm)
combine_terms(dtm, dict)
Print documents which contain one or more terms and return a sub-corpus with these documents.
concordances(corpus, dtm, terms, all = FALSE)
corpus |
A |
dtm |
A |
terms |
One or more terms appearing in |
all |
Whether only documents containing all terms should be printed. By default, documents need to contain at least one of the terms. |
Occurrences of the specified terms are highlighted. If stemming
or other transformations have been applied to original words using
combine_terms, all original words which have been transformed
to the specified terms are highlighted.
Corpus object (invisibly).
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
concordances(corpus, dtm, "oil")
concordances(corpus, dtm, c("oil", "opec"))
concordances(corpus, dtm, c("oil", "opec"), all=TRUE)
# Also works when terms have been combined
dict <- dictionary(dtm)
dtm2 <- combine_terms(dtm, dict)
concordances(corpus, dtm2, "product")
Print documents which contribute the most to an axis of correspondence analysis.
contributive_docs(corpus, ca, axis, ndocs = 10, nterms = 25)
corpus |
A |
ca |
A |
axis |
The CA axis to consider. |
ndocs |
The number of (most contributive) documents to print. |
nterms |
The number of terms to highlight in documents. |
Occurrences of the nterms most contributive terms are highlighted.
If stemming or other transformations have been applied to original words
using combine_terms, all original words which have been transformed
to the specified terms are highlighted.
Corpus object (invisibly).
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
ca <- corpus_ca(corpus, dtm)
contributive_docs(corpus, ca, 1)
# Also works when terms have been combined
dict <- dictionary(dtm)
dtm2 <- combine_terms(dtm, dict)
ca2 <- corpus_ca(corpus, dtm2)
contributive_docs(corpus, ca2, 1)
Show terms that are the most associated (positively or negatively) with a reference term.
cooc_terms( dtm, term, variable = NULL, p = 0.1, n = 25, sparsity = 1, min_occ = 2 )
dtm |
A |
term |
A reference term appearing in |
variable |
An optional vector of values giving the groups for which most frequent terms should be reported. |
p |
The maximum p-value up to which terms should be reported. |
n |
The maximal number of terms to report (for each group, if applicable). |
sparsity |
Value between 0 and 1 indicating the proportion of documents
with no occurrences of a term above which that term should be dropped. By default
all terms are kept (sparsity = 1). |
min_occ |
The minimum number of occurrences in the whole |
Co-occurrent terms are those which are specific to documents which contain
the given term. The output is the same as that returned by specific_terms.
A list of matrices, one for each level of the variable, with columns:
"% Term/Level": the percent of the term's occurrences in all terms occurrences in documents where the chosen term is also present.
"% Level/Term": the percent of the term's occurrences that appear in documents where the chosen term is also present (rather than in documents where it does not appear), i.e. the percent of cooccurrences for the term.
"Global %": the percent of the term's occurrences in all terms occurrences in the corpus (or in the subset of the corpus corresponding to the variable level).
"Level": the number of cooccurrences of the term.
"Global": the number of occurrences of the term in the corpus (or in the subset of the corpus corresponding to the variable level).
"t value": the quantile of a normal distribution corresponding to the probability "Prob.".
"Prob.": the probability of observing such an extreme (high or low) number of occurrences of the term in documents where the chosen term is also present, under a hypergeometric distribution.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
cooc_terms(dtm, "barrel")
cooc_terms(dtm, "barrel", meta(corpus)$Date)
Run a correspondence analysis on a corpus.
corpus_ca(corpus, dtm, variables = NULL, ncp = 5, sparsity = 1, ...)
corpus |
A |
dtm |
A |
variables |
An optional list of variables in |
ncp |
The number of axes to compute (5 by default). Note that this determines the number
of axes that will be used for clustering by |
sparsity |
Value between 0 and 1 indicating the proportion of documents
with no occurrences of a term above which that term should be dropped. By default
all terms are kept (sparsity = 1). |
... |
Additional arguments passed to |
A CA object containing the correspondence analysis results.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
corpus_ca(corpus, dtm, ncp=3, sparsity=0.98)
Run a hierarchical clustering on documents of a corpus based on
a correspondence analysis. The number of axes from ca which are
used depends on the value of the ncp argument passed to corpus_ca.
corpus_clustering(ca, n = 0)
ca |
|
n |
Number of clusters to create. If 0 (the default), it is determined by clicking on the plot to choose the cut height. |
A HCPC object.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
res <- corpus_ca(corpus, dtm, ncp=2, sparsity=0.98)
corpus_clustering(res, 3)
Create a dictionary with information on all words in a corpus.
dictionary(dtm, remove_stopwords = FALSE)
dtm |
A |
remove_stopwords |
Whether stopwords should be removed from the dictionary. |
A data.frame with row names indicating the terms, and columns giving the stem,
the number of occurrences, and whether the term is a stopword.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
dictionary(dtm)
Print documents which have the most extreme coordinates on an axis of correspondence analysis.
extreme_docs(corpus, ca, axis, ndocs = 10, nterms = 25)
corpus |
A |
ca |
A |
axis |
The CA axis to consider. |
ndocs |
The number of (most extreme) documents to print. |
nterms |
The number of terms to highlight in documents. |
Occurrences of the nterms most extreme terms are highlighted.
If stemming or other transformations have been applied to original words
using combine_terms, all original words which have been transformed
to the specified terms are highlighted.
Corpus object (invisibly).
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
ca <- corpus_ca(corpus, dtm)
extreme_docs(corpus, ca, 1)
# Also works when terms have been combined
dict <- dictionary(dtm)
dtm2 <- combine_terms(dtm, dict)
ca2 <- corpus_ca(corpus, dtm2)
extreme_docs(corpus, ca2, 1)
List terms with the highest number of occurrences in the document-term matrix of a corpus, possibly grouped by the levels of a variable.
frequent_terms(dtm, variable = NULL, n = 25)
dtm |
A |
variable |
An optional vector of values giving the groups for which most frequent terms should be reported. |
n |
The maximal number of terms to report (for each group, if applicable). |
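A list of matrices, one for each level of the variable, with columns:
"% Term/Level": the percent of the term's occurrences in all terms occurrences in the level.
"% Level/Term": the percent of the term's occurrences that appear in the level (rather than in other levels).
"Global %": the percent of the term's occurrences in all terms occurrences in the corpus.
"Level": the number of occurrences of the term in the level ("internal").
"Global": the number of occurrences of the term in the corpus.
"t value": the quantile of a normal distribution corresponding to the probability "Prob.".
"Prob.": the probability of observing such an extreme (high or low) number of occurrences of the term in the level, under a hypergeometric distribution.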
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
frequent_terms(dtm)
frequent_terms(dtm, meta(corpus)$Date)
Import a corpus from a file.
import_corpus(paths, format, language, textcolumn = 1, encoding = NULL)
paths |
Path to one or more files, or to a directory (if |
format |
File format: can be |
language |
The language name or code (preferably as IETF language tags,
see |
textcolumn |
When |
encoding |
The character encoding of the file, or |
A Corpus object.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
import_corpus(file, "factiva", language="en")
Build a lexical summary table, optionally over a variable.
lexical_summary(dtm, corpus, variable = NULL, unit = c("document", "global"))
dtm |
A |
corpus |
A |
variable |
An optional vector with one element per document indicating to which category it belongs. If NULL, per-document measures are returned. |
unit |
When |
Words are defined as the forms of two or more characters present in the texts
before stemming and stopword removal. On the contrary, unique terms are extracted
from dtm, which means they do not include words that were removed from it, and that
words different in the original text might become identical terms if stemming was performed.
Please note that percentages for terms and words are computed with regard
respectively to the total number of terms and of words, so the denominators are not the
same for all measures.
When variable is not NULL, unit defines two different ways of
aggregating per-document statistics into per-category measures:
"document": values computed for each document are simply averaged for each category.
"global": values are computed for each category taken as a whole: word counts are summed for each category, and ratios and averages are calculated for this level only, from the summed counts.
This distinction does not make sense when variable=NULL: in this case, "level"
in the above explanation corresponds to "document", and two columns are provided about
the whole corpus.
"Corpus mean" is simply the average value of measures over all documents
"Corpus total" is the sum of the number of terms, the percentage of terms (ratio of the summed numbers of terms) and the average word length in the corpus when taken as a single document.
A table object with the following information for each document or
each category of documents in the corpus:
total number of terms
number and percent of unique terms (i.e. appearing at least once)
number and percent of hapax legomena (i.e. terms appearing once and only once)
total number of words
number and percent of long words (defined as at least seven characters)
number and percent of very long words (defined as at least ten characters)
average word length
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
lexical_summary(dtm, corpus)
Set corpus meta-data variables from a data frame.
set_corpus_variables(corpus, dset)
corpus |
A |
dset |
A |
A Corpus object with meta-data added.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
dset <- data.frame(x=1:length(corpus))
corpus <- set_corpus_variables(corpus, dset)
List terms most associated (positively or negatively) with each document or each of a variable's levels.
specific_terms( dtm, variable = NULL, p = 0.1, n = 25, sparsity = 1, min_occ = 2 )
dtm |
A |
variable |
An optional vector of values giving the groups for which most frequent terms should be reported. |
p |
The maximum p-value up to which terms should be reported. |
n |
The maximal number of terms to report (for each group, if applicable). |
sparsity |
Value between 0 and 1 indicating the proportion of documents
with no occurrences of a term above which that term should be dropped. By default
all terms are kept (sparsity = 1). |
min_occ |
The minimum number of occurrences in the whole |
Specific terms reported here are those whose observed frequency in the document or level has the lowest probability under a hypergeometric distribution, based on their global frequencies in the corpus and on the number of occurrences of all terms in the document or variable level considered. The positive or negative character of the association is visible from the sign of the t value, or by comparing the value of the "% Term/Level" column with that of the "Global %" column.
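A list of matrices, one for each level of the variable, with columns:
"% Term/Level": the percent of the term's occurrences in all terms occurrences in the level.
"% Level/Term": the percent of the term's occurrences that appear in the level (rather than in other levels).
"Global %": the percent of the term's occurrences in all terms occurrences in the corpus.
"Level": the number of occurrences of the term in the level ("internal").
"Global": the number of occurrences of the term in the corpus.
"t value": the quantile of a normal distribution corresponding to the probability "Prob.".
"Prob.": the probability of observing such an extreme (high or low) number of occurrences of the term in the level, under a hypergeometric distribution.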
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
specific_terms(dtm)
specific_terms(dtm, meta(corpus)$Date)
Split documents in a corpus into documents of one or more paragraphs.
split_documents(corpus, chunksize, preserveMetadata = TRUE)
corpus |
A |
chunksize |
The number of paragraphs each new document should contain at most. |
preserveMetadata |
Whether to preserve the meta-data of original documents. |
A Corpus object with split documents.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
split_documents(corpus, 3)
Select documents containing (or not containing) one or more terms.
subset_corpus(corpus, dtm, terms, exclude = FALSE, all = FALSE)
corpus |
A |
dtm |
A |
terms |
One or more terms appearing in |
exclude |
Whether documents containing the terms should be excluded rather than retained. |
all |
Whether only documents containing all terms should be retained or excluded. By default, documents need to contain at least one of the terms. |
Corpus object.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
subset_corpus(corpus, dtm, "barrel")
subset_corpus(corpus, dtm, c("barrel", "opec"))
subset_corpus(corpus, dtm, c("barrel", "opec"), exclude=TRUE)
subset_corpus(corpus, dtm, c("barrel", "opec"), all=TRUE)
Study frequencies of chosen terms in the corpus, among documents, or among levels of a variable.
term_freq(dtm, terms, variable = NULL, by_term = FALSE)
dtm |
A |
terms |
One or more reference term(s) appearing in |
variable |
An optional vector of values giving the groups for which most frequent terms should be reported. |
by_term |
Whether the third dimension of the array should be terms instead of levels. |
A list of matrices, one for each level of the variable, with columns:
"% Term/Level": the percent of the term's occurrences in all terms occurrences in documents where the chosen term is also present.
"% Level/Term": the percent of the term's occurrences that appear in documents where the chosen term is also present (rather than in documents where it does not appear), i.e. the percent of cooccurrences for the term.
"Global %": the percent of the term's occurrences in all terms occurrences in the corpus (or in the subset of the corpus corresponding to the variable level).
"Level": the number of cooccurrences of the term.
"Global": the number of occurrences of the term in the corpus (or in the subset of the corpus corresponding to the variable level).
"t value": the quantile of a normal distribution corresponding to the probability "Prob.".
"Prob.": the probability of observing such an extreme (high or low) number of occurrences of the term in documents where the chosen term is also present, under a hypergeometric distribution.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
term_freq(dtm, "barrel")
term_freq(dtm, "barrel", meta(corpus)$Date)
Plot a graph of terms.
terms_graph( dtm, n = 100, min_occ = 0, interactive = base::interactive(), vertex.label.cex = 1, ... )
dtm |
A |
n |
The maximum number of terms to represent. |
min_occ |
The minimum number of occurrences for a term to be retained. |
interactive |
If |
vertex.label.cex |
The font size for vertex labels. It is interpreted as a multiplication factor of some device-dependent base font size. |
... |
Optional arguments passed to |
The ID of the plot returned by tkplot if interactive=TRUE,
or NULL invisibly otherwise.
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
terms_graph(dtm, 100, 3)
Plot a word cloud from a document-term matrix.
word_cloud(dtm, n = 50, remove_stopwords = TRUE, ...)
dtm |
A |
n |
The maximum number of words to plot. |
remove_stopwords |
Whether to remove stopwords appearing in a language-specific list
(see |
... |
Additional arguments passed to |
file <- system.file("texts", "reut21578-factiva.xml", package="tm.plugin.factiva")
corpus <- import_corpus(file, "factiva", language="en")
dtm <- build_dtm(corpus)
word_cloud(dtm)