Some things to note:
I will be following the ethical scraping principles outlined here.
Finally, I have looked at Webtoon's robots.txt, and it looks like I should be able to scrape comments.
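(As a quick programmatic double-check, the robotstxt package can query the same file. This is just a sketch and not part of the scraping pipeline below; the path shown is the series' list page.)
library(robotstxt)
# returns TRUE if the path is allowed for the default user agent ("*")
paths_allowed(paths = "/en/challenge/tested/list", domain = "www.webtoons.com")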
Reading in comments from the first episode:
Make sure you’ve already downloaded JAVA.
# packages used throughout: RSelenium drives the browser, rvest parses
# the page source, and dplyr/purrr/stringr handle the wrangling
library(RSelenium)
library(rvest)
library(dplyr)
library(purrr)
library(stringr)

# start a Selenium server and open a Chrome session
# (chromever should match the ChromeDriver build for your installed Chrome)
driver <- rsDriver(browser = "chrome", chromever = "72.0.3626.69")
remDr <- driver[["client"]]
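If rsDriver() errors with a version mismatch, binman (installed alongside RSelenium) can list the ChromeDriver builds already downloaded locally, so you can pick one that matches your installed Chrome:
# list locally available chromedriver versions
binman::list_versions("chromedriver")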
The scraping is split into four functions:
- mapping_function(): navigates to an episode, scrapes its comments, and searches for a 2nd page of comments
- scrape_comments(): opens every reply thread on the current page and parses the results
- parse_comments(): pulls episode info, comment text, usernames, likes, and a reply flag out of the page source
- grab_urls(): collects the URL of every episode from the series home page
mapping_function = function(url){
  # these urls are just for testing purposes
  #url = "https://www.webtoons.com/en/challenge/tested/heck-of-a-start/viewer?title_no=231173&episode_no=1"
  #url = "https://www.webtoons.com/en/challenge/tested/happy-new-year/viewer?title_no=231173&episode_no=37"
  # navigate to the episode URL
  remDr$navigate(url)
  Sys.sleep(2)
  # scrape the first page of comments
  df = scrape_comments()
  Sys.sleep(2)
  # navigate to the page with the other comments (if it exists)
  # this code may break if there are more than 2 comment pages
  other_comments <- remDr$findElements(using = 'css', value = "[data-log = 'RPC.pglast']")
  while (length(other_comments) != 0) {
    other_comments[[1]]$clickElement()
    Sys.sleep(2)
    # if the other comments page exists, scrape it and de-duplicate
    df2 = scrape_comments()
    df = bind_rows(df, df2)
    df = df %>% distinct(comment_txt, .keep_all = TRUE)
    other_comments <- remDr$findElements(using = 'css', value = "[data-log = 'RPC.pglast']")
  }
  # output the final df
  df
}
scrape_comments = function(){
  # find all reply buttons
  replies <- remDr$findElements(using = "class name", value = "u_cbox_btn_reply")
  # open all reply threads; sending Enter to the button acts as a click,
  # and seq_along() handles the case where there are no replies at all
  for (i in seq_along(replies)) {
    replies[[i]]$sendKeysToElement(list("laptops", key = "enter"))
    Sys.sleep(1)
  }
  # parse comments
  df_firstrepls = parse_comments(remDr$getPageSource()[[1]])
  # if there are more replies, click each "more replies" button once
  # (need to figure out how to scrape only what is new)
  other_replies <- remDr$findElements(using = 'css', value = "[data-log = 'RPC.rlast']")
  for (j in seq_along(other_replies)) {
    other_replies[[j]]$sendKeysToElement(list("laptops", key = "enter"))
    Sys.sleep(1)
  }
  Sys.sleep(2)
  # re-parse and keep only distinct comments
  df_morerepls = parse_comments(remDr$getPageSource()[[1]])
  df = bind_rows(df_firstrepls, df_morerepls)
  df = df %>% distinct(comment_txt, .keep_all = TRUE)
  # output final df
  df
}
parse_comments = function(pagesrc){
  doc <- read_html(pagesrc)
  # episode number from the episode-list dropdown
  episode_num = doc %>%
    html_nodes(xpath = "//span[starts-with(@class,'tx')]") %>%
    html_text()
  # episode title
  episode = doc %>%
    html_nodes(".subj_episode") %>%
    html_text()
  # comment text
  comment_txt = doc %>%
    html_nodes(".u_cbox_contents") %>%
    html_text()
  # commenter username
  username = doc %>%
    html_nodes(".u_cbox_nick") %>%
    html_text()
  # upvotes per comment
  likes = doc %>%
    html_nodes(".u_cbox_cnt_recomm") %>%
    html_text()
  # likes on the episode itself
  likes_per_ep = doc %>%
    html_nodes("._likeCount") %>%
    html_text()
  # flag replies: each comment <li> stores its metadata in its second
  # attribute, and replies are marked with "replyLevel:2"
  reply <- doc %>%
    html_nodes(xpath = "//li[starts-with(@class,'u_cbox_comment')]") %>%
    html_attrs() %>%
    # extract the second attribute of each node
    map(2) %>%
    unlist() %>%
    str_replace_all(",", " ") %>%
    str_detect("replyLevel:2")
  # combine into a data frame (tibble() replaces the deprecated data_frame())
  tibble(episode_num, episode, comment_txt, username, likes, reply, likes_per_ep)
}
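To make the reply-flag chain in parse_comments() concrete: html_attrs() returns each comment <li>'s attributes, map(2) grabs the second one, and replies carry a replyLevel:2 marker. A toy illustration with made-up attribute strings that mimic that pattern (the real values come from the page source):
# made-up attribute values; not copied from the site
attrs = c("u_cbox_comment,replyLevel:1", "u_cbox_comment re,replyLevel:2")
attrs %>%
  str_replace_all(",", " ") %>%
  str_detect("replyLevel:2")
#> [1] FALSE  TRUE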
grab_urls = function(first_url, home_page){
  # navigate to the series home page
  remDr$navigate(home_page)
  home_raw_page <- read_html(remDr$getPageSource()[[1]])
  list_of_eps = home_raw_page %>%
    html_nodes(".tx") %>%
    html_text()
  # extract the number of the last episode (listed first); the labels
  # contain punctuation, so strip it before converting to numeric
  list_of_eps = list_of_eps %>%
    str_replace_all("[[:punct:]]", "")
  last_ep = as.numeric(list_of_eps[1])
  # pre-allocate the list of urls
  list_of_urls <- vector("list", last_ep)
  Sys.sleep(5)
  # navigate to the first episode, then step through with the "next" button
  remDr$navigate(first_url)
  for (i in 1:last_ep) {
    raw_page <- read_html(remDr$getPageSource()[[1]])
    Sys.sleep(2)
    # pull the canonical url out of the og:url meta tag and add it to the list
    list_of_urls[[i]] = raw_page %>%
      html_nodes(xpath = '//meta[@property="og:url"]') %>%
      html_attr('content')
    if (i != last_ep) {
      # find the next page button and move to the next page
      next_page <- remDr$findElement(using = 'css', value = "[class = 'pg_next _nextEpisode NPI=a:next,g:en_en']")
      next_page$clickElement()
    }
  }
  # output list
  list_of_urls
}
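Before looping over the whole series, it's worth spot-checking the pipeline on a single episode. This reuses the test URL from the comments in mapping_function() and assumes the Selenium session is still open:
# sanity check on one episode before scraping everything
test_df = mapping_function("https://www.webtoons.com/en/challenge/tested/heck-of-a-start/viewer?title_no=231173&episode_no=1")
head(test_df)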
first_url = "https://www.webtoons.com/en/challenge/tested/heck-of-a-start/viewer?title_no=231173&episode_no=1"
home_page = "https://www.webtoons.com/en/challenge/tested/list?title_no=231173"
#grab all the URLs of every episode
urls_list = grab_urls(first_url, home_page)
#scrape comments from every episode
comment_df = map_df(urls_list, mapping_function)
# write to a txt file
write.table(comment_df, file = "comments_20200510.txt", sep = ",", quote = TRUE, row.names = FALSE)
# write to a csv
write.csv(comment_df, 'comments_20200510.csv', row.names = FALSE)
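A quick way to confirm the CSV round-trips before shutting everything down:
# read the file back and inspect the columns
comments_check = read.csv("comments_20200510.csv", stringsAsFactors = FALSE)
str(comments_check)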
# close the browser and shut down the Selenium server
remDr$close()
driver$server$stop()
# print the server process handle to confirm it has exited
driver$server$process