repl.it: @TonySiu / R denver neighborhoods

main.r
# Load required packages (install once; library() them as needed)
install.packages(c("ggplot2", "e1071", "caret", "quanteda", "irlba",
                   "randomForest", "tm", "wordcloud"))

# Load the .csv data and explore it in RStudio
# denver_listings <- read.csv("C:\\Users\\the\\Desktop\\denver_listings.csv", stringsAsFactors = FALSE)
reviews <- read.csv("denver_listings.csv", stringsAsFactors = FALSE)
View(reviews)

# Create a subset with only these columns: id, description, neighbourhood,
# neighborhood_overview, amenities, number_of_reviews_ltm, review_scores_rating
reviews <- reviews[, c("id", "description", "neighbourhood", "neighborhood_overview",
                       "amenities", "number_of_reviews_ltm", "review_scores_rating")]

# Filter listings to only those in Jefferson Park (the "neighbourhood" column)
jefferson_park <- subset(reviews, neighbourhood == "Jefferson Park",
                         select = c("id", "description", "amenities", "neighborhood_overview",
                                    "neighbourhood", "number_of_reviews_ltm", "review_scores_rating"))

# Eliminate listings with a 0 (or missing) value in number_of_reviews_ltm
reviews <- reviews[!is.na(reviews$number_of_reviews_ltm) & reviews$number_of_reviews_ltm > 0, ]
length(which(!complete.cases(reviews)))  # count rows that still have missing fields
# Use the tm package and its stopword list to rank the words most frequently
# used in "amenities", "description", and "neighborhood_overview"
# (the neighborhood_overview ranking answers Question #2)
install.packages("tm")
library(tm)
reviews <- read.csv("denver_listings.csv", stringsAsFactors = FALSE)  # re-read the full data

review_text1 <- paste(reviews$neighborhood_overview, collapse=" ")
review_text2 <- paste(reviews$amenities, collapse=" ")
review_text3 <- paste(reviews$description, collapse=" ")
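
# Note: in Inside Airbnb exports the amenities field is a JSON-style array
# string, e.g. ["Wifi", "Kitchen"]. A minimal pre-cleaning sketch (assuming
# that format): swap the delimiters for spaces rather than deleting them,
# so adjacent amenity names don't fuse into a single token downstream.
review_text2 <- gsub('[][",{}]', " ", review_text2)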


# Swap in review_text2 (amenities) or review_text3 (description) as needed
review_source <- VectorSource(review_text1)
corpus <- Corpus(review_source)

corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
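
# Optional extra pass (not in the original script): listing text is full of
# digits (bedroom counts, street numbers) that can crowd the frequency table;
# tm's removeNumbers drops them.
corpus <- tm_map(corpus, removeNumbers)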

dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)

frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing = TRUE)  # sorted once; slice the top below
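
# Quick sanity check before plotting: print the ten most frequent terms
# and their counts.
head(frequency, 10)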

install.packages("wordcloud")
library(wordcloud)
words <- names(frequency)
wordcloud(words[1:100], frequency[1:100])
barplot(frequency[1:15])
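
# The default barplot drops term labels that don't fit; a more readable
# version (the las/cex values here are just suggested tweaks):
barplot(frequency[1:15], las = 2, cex.names = 0.8,
        ylab = "Frequency", main = "Top 15 Terms")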

# Word-length analysis for the "amenities" and "description" fields;
# a histogram of text lengths is a helpful first look.
reviews <- read.csv("denver_listings.csv", stringsAsFactors = FALSE)
reviews <- reviews[, c("review_scores_rating", "neighbourhood", "amenities", "description")]
reviews$review_scores_rating <- as.factor(reviews$review_scores_rating)
reviews$Textlength <- nchar(reviews$description)
reviews$Textlength1 <- nchar(reviews$amenities)
summary(reviews$Textlength)
summary(reviews$Textlength1)

library(ggplot2)

ggplot(reviews, aes(x = Textlength, fill = neighbourhood)) +
  theme_bw() +
  geom_histogram(binwidth = 5) +
  labs(y = "Text Count", x = "Length of Text",
       title = "Distribution of Text Lengths with Class Labels")