13일차 - 4월 27일

네이버 영화 리뷰 추출

# Load the scraping package.
# (Install it once beforehand with install.packages("rvest").)
library(rvest)

# Download the review listing page and parse it.
url <- "https://movie.naver.com/movie/point/af/list.nhn"
html <- url %>% read_html()
html

# Select the table cells that hold one review each.
review_cell <- html %>% html_nodes("#old_content table tr .title")
review_cell

# Rating: the text of the <em> element inside each review cell.
score <- review_cell %>% html_nodes("em") %>% html_text()
score

# Raw review text: the full text content of each review cell.
review <- html_text(review_cell)
review

# Clean the raw review text held in `review` (one string per review cell).
# (1) Position of the marker that precedes the review body in each string
#     (regexpr() yields -1 where the marker is not found).
index.start <- regexpr("\t별점 -", review)
index.start

# (1) Position of the marker that follows the review body in each string.
index.end <- regexpr("\t신고", review)
index.end

# (2) Keep only the text between the two markers.
review <- substring(review, index.start, index.end)
review

# Drop the first 15 characters of what is left.
# NOTE(review): presumably this skips the rating prefix; the fixed offset
# assumes that prefix always has the same length -- confirm against the page.
review <- substring(review, 16)
review

# (3) Remove control characters (newlines and tabs).
# The previous pattern "[\n | \t]" was a character class that also matched
# spaces and "|", so it deleted every space inside the review text.
review <- gsub("[\n\t]", "", review)
review

# (4) Trim the whitespace left at both ends of each review.
review <- trimws(review, "both")
review

# Extract the movie titles from the "now showing" page and look at the
# distribution of age ratings (e.g. how many titles are rated 15+).

url <- "https://movie.naver.com/movie/running/current.nhn"
html <- read_html(url)
html

# Title cells: each one holds the title link and, optionally, a rating <span>.
title <- html_nodes(html, "#content li .tit")
title

# Movie titles (text of the link inside each title cell).
title1 <- html_nodes(title, "a") %>% html_text()
title1

# Age ratings of the titles that have one (titles without a <span> are skipped,
# so this vector can be shorter than `title1`).
age <- html_nodes(title, "span") %>% html_text()
age

year1 <- html_nodes(html, "#container li .tit")

# One rating per title cell. html_node() (singular) returns exactly one result
# per input node and html_text() gives NA where the <span> is missing, so the
# result stays aligned with the title cells. The previous ifelse() call
# referenced an undefined variable `x` and could not run.
year <- year1 %>% html_node("span") %>% html_text()
# Keep unrated titles visible in the table as an explicit "NA" category.
year[is.na(year)] <- "NA"

barplot(table(year))

과제

혼자 푼 거라 정답과 거리가 멀 수도 있으므로 유의

# Packages used in this section. dplyr and ggplot2 are loaded here because
# count() and ggplot() are called below; previously they were only attached
# later in the script, so this section failed when run top to bottom.
library(readxl)
library(dplyr)
library(ggplot2)
library(lubridate)

# Find the best-selling menu item of the cafe.
cafe <- read_excel("C:\\rstudy\\day12\\data\\Cafe_Sales.xlsx")
View(cafe)
# which.max() keeps the item name; max(table(...)) returned only the count.
item_counts <- table(cafe$item)
item_counts[which.max(item_counts)]

# Month, weekday and quarter of every order.
month <- month(cafe$order_date)
month
day <- wday(cafe$order_date, label = TRUE)
day
season <- quarter(cafe$order_date)
season

# Items sold per weekday, as one row per (weekday, item) with its count `n`.
# summarise(item) returned one row per order, which is deprecated in dplyr.
a <- cafe %>% count(weekday = wday(order_date, label = TRUE), item)
View(a)
# Items sold per season (season = calendar quarter).
b <- cafe %>% count(quarter = quarter(order_date), item)
View(b)

# Shared theme tweak: rotate the item names so they do not overlap.
rotate_x_labels <- theme(
  axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
)

# Price of every order by item.
ggplot(cafe, mapping = aes(x = item, y = price)) +
  geom_point() +
  rotate_x_labels

# The three charts below show the number of sales, so they use geom_bar()
# (row counts). The earlier geom_col(y = price) stacked prices instead.

# Number of sales per item, split by month (factor() gives a discrete legend).
ggplot(cafe, mapping = aes(x = item)) +
  geom_bar(mapping = aes(fill = factor(month(order_date)))) +
  labs(fill = "month") +
  rotate_x_labels

# Number of sales per item, split by weekday.
ggplot(cafe, mapping = aes(x = item)) +
  geom_bar(mapping = aes(fill = wday(order_date, label = TRUE))) +
  labs(fill = "weekday") +
  rotate_x_labels

# Number of sales per item, split by category.
ggplot(cafe, mapping = aes(x = item)) +
  geom_bar(mapping = aes(fill = category)) +
  rotate_x_labels

plot까지 깔끔하게 나오는 코드를 수업 같이 듣는 분에게 받게 되어 첨부함

# Best-selling menu item (name and count).
head(sort(table(cafe$item), decreasing = TRUE), 1)

library(dplyr)
# Weekday name of every order (the names depend on the session locale).
cafe["day"] <- weekdays(cafe$order_date)
head(cafe, 10)

table(cafe$day)

# Distinct weekday names present in the data, with missing values dropped.
# (A stray `weekdays(1)` call was removed here: weekdays() has no method for
# plain numbers, so it stopped the script with an error.)
tmp <- distinct(cafe, day)
tmp <- filter(tmp, !is.na(day))
tmp$day

# Orders of the first weekday found in the data.
a <- filter(cafe, day == tmp$day[1])
a

library(ggplot2)
barplot(sort(table(a$item), decreasing = TRUE), las = 2)

# One bar chart of item counts per weekday.
# This replaces a loop that overwrote `tmp` while still indexing it, and seven
# copy-pasted plots that all drew the first day's data under different titles.
# A 4 x 2 grid has room for all seven days; the previous 3 x 2 grid did not.
old_par <- par(mfrow = c(4, 2))
for (day_name in tmp$day) {
  day_sales <- filter(cafe, day == day_name)
  barplot(
    sort(table(day_sales$item), decreasing = TRUE),
    las = 2,
    main = day_name
  )
}
# Restore the previous graphics layout so later base plots are not affected.
par(old_par)

# Menu sales by season
cafe$month <- as.numeric(format(cafe$order_date, "%m"))
# Months 12, 1 and 2 keep the default "winter"; the rest are overwritten below.
cafe$season <- "winter"
cafe$season[cafe$month >= 3 & cafe$month < 6] <- "spring"
cafe$season[cafe$month >= 6 & cafe$month < 9] <- "summer"
cafe$season[cafe$month >= 9 & cafe$month < 12] <- "autumn"
# Orders without a date have no season; previously they were counted as winter.
cafe$season[is.na(cafe$month)] <- NA
# Explicit levels keep the seasons in calendar order instead of alphabetical.
cafe$season <- factor(
  cafe$season,
  levels = c("spring", "summer", "autumn", "winter")
)
summary(cafe$season)

# Number of sales per (season, item); count() returns an ungrouped result.
cafe_sales_season <- cafe |>
  filter(!is.na(season)) |>
  count(season, item, name = "sales_amt")
ggplot(cafe_sales_season, aes(x = item, y = sales_amt)) +
  geom_bar(position = "dodge", stat = "identity") +
  facet_wrap(~season) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Sales by category (qplot() is deprecated; this is the ggplot() equivalent)
ggplot(cafe, aes(x = category)) +
  geom_bar()

# Sales by month
sales_month <- cafe |> count(month)
ggplot(sales_month, aes(x = as.factor(month), y = n)) +
  geom_col()

# Number of sales by weekday
# `day_no` is the ISO weekday number (1 = Monday ... 7 = Sunday); it is used
# only to order the bars, because weekday names sort alphabetically.
sales_day <- cafe |>
  mutate(day_no = as.integer(format(order_date, "%u"))) |>
  count(day_no, day)
ggplot(sales_day, aes(x = reorder(day, day_no), y = n)) +
  geom_col() +
  labs(x = "day")

# Draw maps of South Korea with the raster package.
# Install rgdal only when it is missing instead of reinstalling on every run.
# NOTE(review): rgdal has been retired from CRAN -- confirm it is still
# installable and actually needed here.
if (!requireNamespace("rgdal", quietly = TRUE)) {
  install.packages("rgdal")
}
library(raster)
library(rgdal)
library(ggplot2)

# Country outline (GADM level 0)
korea <- getData(name = "GADM", country = "kor", level = 0)

# Provinces / metropolitan cities (GADM level 1)
korea_sido <- getData(name = "GADM", country = "kor", level = 1)

# Cities / counties / districts (GADM level 2)
korea_sigungu <- getData(name = "GADM", country = "kor", level = 2)

# Inspect the attribute columns of the downloaded data.
korea$GID_0
korea$NAME_0
korea_sido$GID_1
korea_sido$NAME_1
korea_sido$VARNAME_1
korea_sido$NL_NAME_1
korea_sido$TYPE_1
korea_sido$ENGTYPE_1
korea_sido$HASC_1

# Level-2 units whose level-1 name is "Seoul".
seoul <- korea_sigungu[korea_sigungu$NAME_1 == "Seoul", ]
seoul$GID_2
seoul$NAME_2
seoul$NL_NAME_2
seoul$TYPE_2
seoul$ENGTYPE_2

# Draw the polygons of `shape` as a white map with black borders and no axis
# decoration. `title` is the plot title. Returns the ggplot object.
plot_boundaries <- function(shape, title) {
  ggplot(shape) +
    geom_polygon(aes(x = long, y = lat, group = group),
                 fill = "white", color = "black") +
    labs(title = title) +
    theme(axis.ticks = element_blank(),
          axis.title = element_blank(),
          axis.text = element_blank())
}

# Country
p1 <- plot_boundaries(korea, "Korea")
p1

# Provinces
p2 <- plot_boundaries(korea_sido, "Sido")
p2

# Districts
p3 <- plot_boundaries(korea_sigungu, "Sigungu")
p3

# Load the A/B test data from the Excel workbook and open it in the viewer.
library(readxl)
abtest <- read_excel(path = "C:\\rstudy\\day12\\data\\abtest.xlsx")
View(abtest)
# H0: the number of times the ad e-mail was opened is not correlated with the
#     number of purchase conversions after a click.
# H1: the two are correlated.
cor.test(x = abtest$open, y = abtest$conversion)

Pearson's product-moment correlation

data: abtest$open and abtest$conversion
t = 134.71, df = 43966, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5338593 0.5470924
sample estimates:
cor
0.5405093

# H0: the number of ad clicks is not correlated with the number of purchase
#     conversions after a click.
# H1: the two are correlated.
cor.test(x = abtest$click, y = abtest$conversion)

Pearson's product-moment correlation

data: abtest$click and abtest$conversion
t = 289.38, df = 43966, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8065303 0.8129666
sample estimates:
cor
0.8097728

# H0: the combined count of e-mail opens and ad clicks (open + click) is not
#     correlated with the number of purchase conversions after a click.
# H1: the two are correlated.
cor.test(x = abtest$open + abtest$click, y = abtest$conversion)

Pearson's product-moment correlation

data: abtest$open + abtest$click and abtest$conversion
t = 137.62, df = 43966, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5421291 0.5551956
sample estimates:
cor
0.5486959

세 검정 모두 p-value가 2.2e-16 미만으로 유의수준 0.05보다 작으므로 귀무가설을 기각함.

상관계수는 약 0.54(open), 0.81(click), 0.55(open+click)로, 모두 중간 이상의 양의 상관관계가 있음을 확인할 수 있음.

공부 일지

13일차 - 4월 27일

티스토리툴바