Count unique words

The R code below counts the unique words in a text file.

# Count the unique words in a plain-text file chosen interactively.
#
# Results left in the workspace:
#   words     - character vector of cleaned, non-empty word tokens
#   x_numbers - character vector of the digit runs found in the tokens
# The final expression yields the number of distinct words. Matching is
# exact, so "The" and "the" are counted as two different words.

# Read the file line by line and split on whitespace. Unlike read.table(),
# this does not treat quotes/apostrophes as string delimiters, does not drop
# text after "#", and does not convert tokens such as "NA" or "TRUE".
text_lines <- readLines(file.choose(), warn = FALSE)
words <- as.character(
  unlist(strsplit(text_lines, "[[:space:]]+"), use.names = FALSE)
)

# Strip punctuation, then drop tokens that are now empty (this also removes
# the empty token strsplit() produces for lines with leading whitespace).
words <- gsub("[[:punct:]]", "", words)
words <- words[nzchar(words)]

# Extract the digit runs BEFORE they are removed from the words; doing it
# afterwards would always give an empty result.
x_numbers <- unlist(regmatches(words, gregexpr("[[:digit:]]+", words)))

# Remove digits from every token and drop tokens that consisted only of
# digits.
words <- gsub("[[:digit:]]+", "", words)
words <- words[nzchar(words)]

length(unique(words)) # count unique words