Web scraping WEBTOON comments

Some things to note:

  - This post uses RSelenium to scrape Line WEBTOON comments, reading them in from the first episode onward.
  - Make sure you've already installed Java; the Selenium server that RSelenium starts depends on it.
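
To confirm Java is actually available, you can run a quick check from R (this just shells out to the java binary on your PATH):

#should print a Java version string; if it errors, install Java first
system("java -version")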

Starting the Selenium driver and opening a Chrome session:

library(RSelenium)
library(rvest)
library(tidyverse)

#driver = rsDriver(browser = c("chrome"))
driver <- rsDriver(browser = c("chrome"), chromever = "72.0.3626.69")
remDr <- driver[["client"]]
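
If that chromever doesn't match the Chrome installed on your machine, rsDriver will fail to start; you can list the chromedriver versions RSelenium knows about (via its binman dependency) and pick the closest one:

#list the available chromedriver versions;
#choose the one closest to your installed Chrome version
binman::list_versions("chromedriver")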

Overview of the process:

  1. grab the URLs of every episode
  2. map a scraping function onto those URLs: comment_df = map_df(list_of_urls, mapping_function)

Four functions do the work: mapping_function navigates to each episode and its extra comment pages, scrape_comments opens every reply thread, parse_comments pulls the fields out of the page source, and grab_urls collects the episode links. A toy illustration of the map_df step follows.
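
For anyone new to purrr, map_df applies a function to each element of a list and row-binds the resulting data frames into one. A toy example with made-up values:

#map_df(x, f) is roughly bind_rows(lapply(x, f))
map_df(1:3, function(ep) tibble(episode = ep, n_comments = ep * 10))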

Map function

mapping_function = function(url){
    
  # these urls are just for testing purposes
  #url = "https://www.webtoons.com/en/challenge/tested/heck-of-a-start/viewer?title_no=231173&episode_no=1"
  
  #url = "https://www.webtoons.com/en/challenge/tested/happy-new-year/viewer?title_no=231173&episode_no=37"
  
  #navigate to URL
  remDr$navigate(url)
  
  Sys.sleep(2)
  
  #scrape
  df = scrape_comments()
  
  Sys.sleep(2)
  
  #navigate to the page with the remaining comments, if one exists
  #(note: this may break if there are more than 2 comment pages)
  other_comments <- remDr$findElements(using = 'css',  value = "[data-log = 'RPC.pglast']")
  
  while (length(other_comments) != 0) {

    other_comments[[1]]$clickElement()
    
    Sys.sleep(2)
    
    #if other comments page exists, scrape
    df2 = scrape_comments()
    
    df = bind_rows(df, df2)
    
    df = df %>% distinct(comment_txt, .keep_all = TRUE)

    other_comments <- remDr$findElements(using = 'css',  value = "[data-log = 'RPC.pglast']")
  } 
  
  #output the final df
  df
}
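
Before mapping this over every episode, it's worth a smoke test on a single URL (using the first test episode from the comments above):

#quick single-episode test before running the full scrape
test_df = mapping_function("https://www.webtoons.com/en/challenge/tested/heck-of-a-start/viewer?title_no=231173&episode_no=1")
head(test_df)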

Scraping function

scrape_comments = function(){
  
  #find all reply buttons
  replies <- remDr$findElements(using = "class name",  value = "u_cbox_btn_reply")
  
  #open every reply thread (sending keys then enter stands in for a click)
  for (i in seq_along(replies)) {
    replies[[i]]$sendKeysToElement(list("laptops",key = "enter"))
    Sys.sleep(1)
  }

  #parse comments
  df_firstrepls = parse_comments(remDr$getPageSource()[[1]])
  
  #if there are more replies, open them all, then rescrape the page
  #(TODO: scrape only what is new instead of re-parsing everything)
  other_replies <- remDr$findElements(using = 'css',  value = "[data-log = 'RPC.rlast']")
  
  if (length(other_replies) != 0) {
    for (j in seq_along(other_replies)) {
      other_replies[[j]]$sendKeysToElement(list("laptops",key = "enter"))
      Sys.sleep(1)
    }
  }
  
  Sys.sleep(2)
  
  df_morerepls = parse_comments(remDr$getPageSource()[[1]])
    
  df = bind_rows(df_firstrepls, df_morerepls)
  
  df = df %>% distinct(comment_txt, .keep_all = TRUE)
  
  #output final df
  df

}
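
Buttons found by findElements can go stale if the page re-renders between the find and the click. A defensive variant of the reply-opening loop, sketched here as a standalone helper rather than part of the function above, skips any element that errors instead of crashing the whole scrape:

#hedged helper: press enter on each button, skipping any that fail
open_replies_safely = function(buttons){
  for (b in buttons) {
    tryCatch({
      b$sendKeysToElement(list(key = "enter"))
      Sys.sleep(1)
    }, error = function(e) message("skipping a reply button: ", conditionMessage(e)))
  }
}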

Parse function

parse_comments = function(pagesrc){
  
  doc <- read_html(pagesrc)

  episode_num = doc %>%
    html_nodes(xpath = "//span[starts-with(@class,'tx')]") %>% 
    #html_nodes(".tx _btnOpenEpisodeList NPI=a:current,g:en_en") %>%
    html_text()
  
  episode = doc %>%
    html_nodes(".subj_episode") %>%
    html_text()
  
  comment_txt = doc %>%
    html_nodes(".u_cbox_contents") %>%
    html_text()
  
  username = doc %>%
    html_nodes(".u_cbox_nick") %>%
    html_text()
  
  likes = doc %>%
    html_nodes(".u_cbox_cnt_recomm") %>%
    html_text()
  
  likes_per_ep = doc %>%
    html_nodes("._likeCount") %>%
    html_text()
  
  reply <- doc %>% 
    html_nodes(xpath = "//li[starts-with(@class,'u_cbox_comment')]") %>% 
    html_attrs() %>%
    #take the second attribute of each li, whose value records the reply level
    map(2) %>%
    unlist() %>% 
    str_replace_all(",", " ") %>% 
    str_detect("replyLevel:2") 
  
  #combine all the fields into one tibble
  tibble(episode_num, episode, comment_txt, username, likes, reply, likes_per_ep)
  
}
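
The likes and likes_per_ep columns come back as character strings. A small cleanup step you could run after scraping (a sketch; parse_number comes from readr, which loads with the tidyverse):

#convert the scraped count columns from text to numbers
clean_comments = function(df){
  df %>%
    mutate(likes = readr::parse_number(likes),
           likes_per_ep = readr::parse_number(likes_per_ep))
}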

Grabbing URLs function

grab_urls = function(first_url, home_page){
  
  #navigate to home page
  remDr$navigate(home_page)
  home_raw_page <- read_html(remDr$getPageSource()[[1]])
  list_of_eps = home_raw_page %>% 
    html_nodes(".tx") %>%
    html_text() 

  #strip punctuation from the labels and take the first entry, which is the newest episode
  list_of_eps = list_of_eps %>% 
    str_replace_all("[[:punct:]]", "") 
  
  last_ep = as.numeric(list_of_eps[1])
  
  #initialize the list of URLs (one slot per episode)
  list_of_urls <- vector("list", last_ep)
  
  #initialize the loop counter
  i = 1
  
  Sys.sleep(5)
  
  #navigate to first url
  remDr$navigate(first_url)
  
  while (i <= last_ep) {
    
    raw_page <- read_html(remDr$getPageSource()[[1]])
  
    Sys.sleep(2)
    
    #add url to list
    list_of_urls[[i]] = raw_page %>% 
      html_nodes(xpath = '//meta[@property="og:url"]') %>%
      html_attr('content')

    if (i != last_ep) {
      #find next page button
      next_page <- remDr$findElement(using = 'css',  value = "[class = 'pg_next _nextEpisode NPI=a:next,g:en_en']")
  
      #move to the next page
      next_page$clickElement()
    }
    
    i = i + 1
    
  }

  #output list
  list_of_urls
  
}

Putting it all together

first_url = "https://www.webtoons.com/en/challenge/tested/heck-of-a-start/viewer?title_no=231173&episode_no=1"

home_page = "https://www.webtoons.com/en/challenge/tested/list?title_no=231173"

#grab all the URLs of every episode
urls_list = grab_urls(first_url, home_page)
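
#sanity check (optional): every grabbed entry should be a viewer URL
#with an episode_no parameter
stopifnot(all(str_detect(unlist(urls_list), "episode_no=")))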

#scrape comments from every episode
comment_df = map_df(urls_list, mapping_function)

#write to a txt file
write.table(comment_df, file = "comments_20200510.txt", sep = ",", quote = TRUE, row.names = F)

#write to a csv
write.csv(comment_df, 'comments_20200510.csv', row.names = FALSE)

Closing webdriver

remDr$close()
driver$server$stop()

#printing the server's process handle confirms whether it is still running
driver$server$process