Bryce Wong
May 14, 2019
First reading in the data (updated as of May 10, 2019 - this was run BEFORE episode #55 had been posted):
# Read in the scraped Webtoons comment data (snapshot taken before episode #55 posted)
webtoons_data <- read_csv(file = "./data/comments_june_12.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## episode_num = col_character(),
## episode = col_character(),
## comment_txt = col_character(),
## username = col_character(),
## likes = col_integer(),
## reply = col_logical(),
## likes_per_ep = col_integer()
## )
# Clean the raw data: drop the scraper's test account, drop the auto-generated
# index column, and derive a numeric episode number plus a season label.
webtoons_data <- webtoons_data %>%
  filter(username != "TESTED @YGetIt on IG") %>%
  select(-X1) %>%
  mutate(
    # strip the leading "#" so e.g. "#12" parses as 12
    episode_num = as.numeric(str_replace(episode_num, "#", "")),
    # Season boundaries were determined by hand from the episode titles;
    # anything past episode 37 falls through to season 4. case_when replaces
    # the nested ifelse chain; `%in%` never returns NA, so the fallback
    # branch behaves identically to the original.
    season = case_when(
      episode_num %in% 1:12  ~ "1",
      episode_num %in% 13:24 ~ "2",
      episode_num %in% 25:37 ~ "3",
      TRUE                   ~ "4"
    )
  )
# One row per unique episode, retaining the remaining columns.
number_of_eps <- distinct(webtoons_data, episode, .keep_all = TRUE)
The total number of comments is 566.
The total number of episodes is 57.
Now getting the number of comments per each episode:
# Number of comments per episode, most-commented first.
num_eps <- webtoons_data %>%
  count(episode) %>%
  arrange(desc(n))

# Table of the top 10 episodes by comment count (ties included).
# Passing the weighting column `n` explicitly to top_n() avoids the
# "Selecting by n" message without changing the result.
num_eps %>%
  top_n(10, n) %>%
  rename(number_of_comments = n) %>%
  knitr::kable(digits = 3)
## Selecting by n
episode | number_of_comments |
---|---|
WORLD AIDS DAY!!! | 26 |
Heck of a Start | 24 |
Brunchy Brunch | 20 |
Sometimes People SUCK!!! | 18 |
This Could Be Bad | 18 |
FIGHT!!!! | 17 |
HAPPY NEW YEAR!!!!! | 17 |
Doctor Visit | 16 |
Prayers | 16 |
Further South | 15 |
Now getting the number of likes per each comment:
# Comments sorted by number of likes, most-liked first.
arrange_by_likes <- webtoons_data %>%
  arrange(desc(likes))

# Table of the top 10 comments by number of likes.
# Reuse the sorted table computed above instead of re-sorting the raw data.
head(arrange_by_likes, 10)
## # A tibble: 10 x 8
## episode_num episode comment_txt userna~ likes reply likes_per_ep season
## <dbl> <chr> <chr> <chr> <int> <lgl> <int> <chr>
## 1 1 Heck o~ i love ham~ sub<U+~ 125 FALSE 545 1
## 2 1 Heck o~ omg is tha~ saphir~ 93 FALSE 545 1
## 3 1 Heck o~ Hamilton :~ swirli~ 82 FALSE 545 1
## 4 40 Soluti~ I'm glad s~ frowsy 66 FALSE 221 4
## 5 23 This C~ SHE HAD ON~ GrimmZ~ 65 FALSE 248 2
## 6 30 You Ju~ Clearly ta~ coyowo~ 58 FALSE 251 3
## 7 6 Brunch~ wait what.~ happyc~ 57 FALSE 312 1
## 8 50 Tragedy "This one ~ pompou~ 54 FALSE 152 4
## 9 6 Brunch~ There will~ gillea~ 51 FALSE 312 1
## 10 32 WORLD ~ honestly t~ just y~ 51 FALSE 291 3
Now getting the number of comments per each unique user:
# Number of comments each unique user has posted, most active first.
num_users <- webtoons_data %>%
  count(username) %>%
  arrange(desc(n))

# Top 10 users by comment count (ties included, so more than 10 rows may print).
# NOTE(review): could not be rendered with kable, possibly because a username
# contains non-UTF8 bytes.
# Passing `n` explicitly to top_n() avoids the "Selecting by n" message.
num_users %>%
  top_n(10, n) %>%
  rename(number_of_comments = n)
## Selecting by n
## # A tibble: 12 x 2
## username number_of_comments
## <chr> <int>
## 1 gilleanfryingpan 45
## 2 AoiYeyi 22
## 3 sausage172000 22
## 4 happycat(: 18
## 5 19danny15 16
## 6 Cheapthrill_Xo 16
## 7 CopperMortar 14
## 8 "\xb0\x95Mariella\x95\xb0" 13
## 9 catberra 12
## 10 "Johnna\x92s Way" 12
## 11 Mugly Mae 12
## 12 RedtheGreyFox 12
# Likes per episode (likes of the episode itself - NOT of its comments):
# one row per episode, keeping only the columns we need. The original built
# this in two assignments to the same variable; merged into one pipe.
ep_likes <- webtoons_data %>%
  distinct(episode, .keep_all = TRUE) %>%
  select(episode, likes_per_ep)

# Table of the top 10 episodes by number of likes
# (original comment wrongly said "comments").
head(arrange(ep_likes, desc(likes_per_ep)), 10) %>%
  knitr::kable(digits = 3)
episode | likes_per_ep |
---|---|
Heck of a Start | 545 |
Uh oh | 439 |
Flash Back | 373 |
Doctor Visit | 342 |
Work It Out | 338 |
Brunchy Brunch | 312 |
WORLD AIDS DAY!!! | 291 |
Rough Start | 282 |
It Goes Down in the Bathroom | 270 |
Ape S#$% | 265 |
Now a bunch of tables showing basic summary statistics for comments per episode, comments per user, likes per comment, and likes per episode.
Also, one histogram at the end to show the distribution of likes.
(The histogram of the distribution of number of comments per episode was a bit funky and probably not worth viewing)
# Summary statistics of comment counts across all episodes.
avg_num_comm <- knitr::kable(
  summarize(num_eps,
            mean_comments_per_ep = mean(n),
            median_comments_per_ep = median(n),
            sd_comments = sd(n)),
  digits = 3
)
avg_num_comm
mean_comments_per_ep | median_comments_per_ep | sd_comments |
---|---|---|
9.93 | 8 | 5.321 |
# Summary statistics of comments posted per unique user.
avg_user <- knitr::kable(
  summarize(num_users,
            mean_comments_per_user = mean(n),
            median_comments_per_user = median(n),
            sd_comments = sd(n)),
  digits = 3
)
avg_user
mean_comments_per_user | median_comments_per_user | sd_comments |
---|---|---|
2.903 | 1 | 4.786 |
# Summary statistics of likes received per comment.
avg_likes <- knitr::kable(
  summarize(webtoons_data,
            mean_likes_per_comment = mean(likes),
            median_likes_per_comment = median(likes),
            sd_likes = sd(likes)),
  digits = 3
)
avg_likes
mean_likes_per_comment | median_likes_per_comment | sd_likes |
---|---|---|
7.528 | 4 | 11.785 |
# Summary statistics of total comment likes per episode:
# first sum likes within each episode, then summarize across episodes.
# (Spelling normalized to summarize() throughout; summarise() is identical.)
avg_total_likes <- webtoons_data %>%
  group_by(episode) %>%
  summarize(likes = sum(likes)) %>%
  summarize(
    mean_total_likes = mean(likes),
    median_total_likes = median(likes),
    sd_total_likes = sd(likes)
  ) %>%
  knitr::kable(digits = 3)
avg_total_likes
mean_total_likes | median_total_likes | sd_total_likes |
---|---|---|
74.754 | 57 | 66.136 |
# Summary statistics of likes per episode (likes of episode - NOT comments):
# deduplicate to one row per episode before summarizing.
avg_likes_per_ep <- webtoons_data %>%
  distinct(episode, .keep_all = TRUE) %>%
  summarize(
    mean_likes_per_ep = mean(likes_per_ep),
    median_likes_per_ep = median(likes_per_ep),
    sd_likes = sd(likes_per_ep)
  ) %>%
  knitr::kable(digits = 3)
avg_likes_per_ep
mean_likes_per_ep | median_likes_per_ep | sd_likes |
---|---|---|
231.877 | 245 | 78.863 |
# Visualizations
# Distribution of likes across all comments.
ggplot(data = webtoons_data, mapping = aes(x = likes)) +
  geom_histogram() +
  labs(
    title = "Distribution of likes per comment",
    x = "Number of likes",
    y = "Count"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Creating a variable that organizes the episodes by season - note that this code will not be extendable to organizing future episodes by season (there is no "season" identification marker built into the way the episodes were uploaded to Webtoons).
The assignment of an episode to a specific season had to be done by hand, by taking a look at the titles and figuring out which season they belong to.
Note that not all seasons have the same number of episodes (some seasons have filler episodes) - thus, the average number of likes and comments per an episode in each season have also been calculated.
# Working table for all season-level analyses below.
seasons <- select(webtoons_data, season, episode_num, episode, comment_txt, likes_per_ep)

# Total episode likes per season: deduplicate to one row per episode,
# then sum within each season.
likes_per_season <- seasons %>%
  distinct(episode_num, .keep_all = TRUE) %>%
  group_by(season) %>%
  summarize(total_likes = sum(likes_per_ep))
knitr::kable(likes_per_season, digits = 3)
season | total_likes |
---|---|
1 | 3918 |
2 | 2982 |
3 | 3176 |
4 | 3141 |
# Visualization: total episode likes per season.
# geom_col() is the idiomatic shorthand for geom_bar(stat = 'identity').
ggplot(likes_per_season, aes(x = season, y = total_likes, fill = season)) +
  geom_col()
# Average likes per episode within each season
# (seasons differ in episode count, so averages are more comparable than totals).
avg_ep_likes_season <- seasons %>%
  distinct(episode_num, .keep_all = TRUE) %>%
  group_by(season) %>%
  summarize(avg_ep_likes = mean(likes_per_ep))
knitr::kable(avg_ep_likes_season, digits = 3)
season | avg_ep_likes |
---|---|
1 | 326.500 |
2 | 248.500 |
3 | 244.308 |
4 | 157.050 |
# Visualization: average episode likes per season.
# geom_col() is the idiomatic shorthand for geom_bar(stat = 'identity').
ggplot(avg_ep_likes_season, aes(x = season, y = avg_ep_likes, fill = season)) +
  geom_col()
# Total number of comments posted per season.
comments_per_season <- seasons %>%
  count(season) %>%
  rename(total_comments = n)
knitr::kable(comments_per_season, digits = 3)
season | total_comments |
---|---|
1 | 132 |
2 | 128 |
3 | 159 |
4 | 147 |
# Visualization: total comments per season.
# geom_col() is the idiomatic shorthand for geom_bar(stat = 'identity').
ggplot(comments_per_season, aes(x = season, y = total_comments, fill = season)) +
  geom_col()
# Average comments per episode within each season:
# count comments per (season, episode), then average within season.
avg_ep_comments_season <- seasons %>%
  count(season, episode) %>%
  group_by(season) %>%
  summarize(avg_ep_comments = mean(n))
knitr::kable(avg_ep_comments_season, digits = 3)
season | avg_ep_comments |
---|---|
1 | 11.000 |
2 | 10.667 |
3 | 12.231 |
4 | 7.350 |
# Visualization: average comments per episode by season.
# geom_col() is the idiomatic shorthand for geom_bar(stat = 'identity').
ggplot(avg_ep_comments_season, aes(x = season, y = avg_ep_comments, fill = season)) +
  geom_col()
# Label filler episodes (32, 36-38, 54-57) with "yes", everything else "no".
# BUG FIX: the original used `episode_num == 54:57`, which recycles the vector
# 54:57 element-wise down the column (hence the "longer object length is not a
# multiple of shorter object length" warning) and labels episodes 54-57
# essentially at random. `%in%` is the correct set-membership test.
filler_data <- seasons %>%
  mutate(
    filler = ifelse(episode_num %in% c(32, 36:38, 54:57), "yes", "no")
  )
## Warning in episode_num == 54:57: longer object length is not a multiple of
## shorter object length
# Average comments per episode: filler vs non-filler episodes.
avg_ep_comments_filler <- filler_data %>%
  count(season, episode_num, filler) %>%
  group_by(filler) %>%
  summarize(avg_ep_comments = mean(n))
knitr::kable(avg_ep_comments_filler, digits = 3)
filler | avg_ep_comments |
---|---|
no | 9.472 |
yes | 9.143 |
# Visualization: average episode comments, filler vs non-filler.
# geom_col() is the idiomatic shorthand for geom_bar(stat = 'identity').
ggplot(avg_ep_comments_filler, aes(x = filler, y = avg_ep_comments, fill = filler)) +
  geom_col()
# Average likes per episode: filler vs non-filler episodes
# (deduplicate to one row per episode first).
avg_ep_likes_filler <- filler_data %>%
  distinct(episode_num, .keep_all = TRUE) %>%
  group_by(filler) %>%
  summarize(avg_ep_likes = mean(likes_per_ep))
knitr::kable(avg_ep_likes_filler, digits = 3)
filler | avg_ep_likes |
---|---|
no | 235.077 |
yes | 198.600 |
# Visualization: average episode likes, filler vs non-filler.
# geom_col() is the idiomatic shorthand for geom_bar(stat = 'identity').
ggplot(avg_ep_likes_filler, aes(x = filler, y = avg_ep_likes, fill = filler)) +
  geom_col()
# Average comments per episode by season, filler vs non-filler.
avg_comments_filler_season <- filler_data %>%
  count(season, episode_num, filler) %>%
  group_by(season, filler) %>%
  summarize(avg_ep_comments = mean(n)) %>%
  # summarize() drops only the last grouping level, leaving the result
  # grouped by season; ungroup fully to avoid surprising downstream ops.
  ungroup()
avg_comments_filler_season %>%
  knitr::kable(digits = 3)
season | filler | avg_ep_comments |
---|---|---|
1 | no | 11.000 |
2 | no | 10.667 |
3 | no | 10.600 |
3 | yes | 17.667 |
4 | no | 7.158 |
4 | yes | 2.750 |
# Visualization: average episode comments by season, split by filler status.
# geom_col() is the idiomatic shorthand for geom_bar(stat = 'identity').
ggplot(avg_comments_filler_season, aes(x = season, y = avg_ep_comments, fill = filler)) +
  geom_col()
# Average likes per episode by season, filler vs non-filler.
avg_likes_filler_season <- filler_data %>%
  distinct(episode_num, .keep_all = TRUE) %>%
  group_by(season, filler) %>%
  summarize(avg_ep_likes = mean(likes_per_ep)) %>%
  # summarize() drops only the last grouping level, leaving the result
  # grouped by season; ungroup fully to avoid surprising downstream ops.
  ungroup()
avg_likes_filler_season %>%
  knitr::kable(digits = 3)
season | filler | avg_ep_likes |
---|---|---|
1 | no | 326.500 |
2 | no | 248.500 |
3 | no | 249.400 |
3 | yes | 227.333 |
4 | no | 157.222 |
4 | yes | 155.500 |
# Visualization: average episode likes by season, split by filler status.
# geom_col() is the idiomatic shorthand for geom_bar(stat = 'identity').
ggplot(avg_likes_filler_season, aes(x = season, y = avg_ep_likes, fill = filler)) +
  geom_col()
Be aware that the labelling of health messaging episodes is hardcoded - to update this code for later scraping, you need to update the episodes included as health messaging.
# Label health-messaging episodes (32, 51, 54-56) with "yes", all others "no".
# Consolidated the nested ifelse chain into a single `%in%` membership test,
# consistent with the filler labeling; unlike scalar `==`, `%in%` also never
# propagates NA.
health_data <- seasons %>%
  mutate(
    health = ifelse(episode_num %in% c(32, 51, 54:56), "yes", "no")
  )
# Average comments per episode: health-messaging vs all other episodes.
avg_ep_comments_health <- health_data %>%
  count(season, episode_num, health) %>%
  group_by(health) %>%
  summarize(avg_ep_comments = mean(n))
knitr::kable(avg_ep_comments_health, digits = 3)
health | avg_ep_comments |
---|---|
no | 10.173 |
yes | 7.400 |
# Visualization: average episode comments, health-messaging vs other episodes.
# geom_col() is the idiomatic shorthand for geom_bar(stat = 'identity').
ggplot(avg_ep_comments_health, aes(x = health, y = avg_ep_comments, fill = health)) +
  geom_col()
# Average likes per episode: health-messaging vs all other episodes
# (deduplicate to one row per episode first).
avg_ep_likes_health <- health_data %>%
  distinct(episode_num, .keep_all = TRUE) %>%
  group_by(health) %>%
  summarize(avg_ep_likes = mean(likes_per_ep))
knitr::kable(avg_ep_likes_health, digits = 3)
health | avg_ep_likes |
---|---|
no | 239.192 |
yes | 155.800 |
# Visualization: average episode likes, health-messaging vs other episodes.
# geom_col() is the idiomatic shorthand for geom_bar(stat = 'identity').
ggplot(avg_ep_likes_health, aes(x = health, y = avg_ep_likes, fill = health)) +
  geom_col()
Note: a lot of the code here is adapted from Jeff Goldsmith's TidyText lecture.
# Prepare comments for sentiment analysis (adapted from a tidytext workflow):
# give each comment a stable id and bucket comments by like count.
webtoon_comments <-
  webtoons_data %>%
  mutate(comment_num = row_number(),
         # <=4 likes = low, 5-10 = middle, >10 = high (breaks are right-closed)
         like_category = cut(likes, breaks = c(-Inf, 4, 10, Inf),
                             labels = c("low","middle","high"))) %>%
  as_tibble()

data(stop_words)

# One row per (comment, word), with common English stop words removed.
# The join key is spelled out to silence the 'Joining, by = "word"' message.
comment_words <-
  webtoon_comments %>%
  unnest_tokens(word, comment_txt) %>%
  anti_join(stop_words, by = "word")
## Joining, by = "word"
# Net sentiment per comment using the bing lexicon:
# sentiment = (# positive words) - (# negative words).
# Join keys are spelled out explicitly to silence the "Joining, by = ..."
# messages and guard against accidental key changes.
comment_word_sentiments <- comment_words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(comment_num, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  left_join(webtoon_comments, by = "comment_num")
## Joining, by = "word"
## Joining, by = "comment_num"
# Comments ordered by descending net sentiment, colored by like category.
# The x axis is hidden because individual comment ids are not informative.
# geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
ggplot(comment_word_sentiments,
       aes(x = reorder(comment_num, -sentiment),
           y = sentiment, fill = like_category, color = like_category)) +
  geom_col() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
Most positive comment:
# Text of the comment(s) with the highest net sentiment score.
pull(filter(comment_word_sentiments, sentiment == max(sentiment)), comment_txt)
## [1] "Thank you so much notgaybutnot straight thank you for listening so wish I can say everything else had a happy ending but it<U+0092>s okay because people like you make this life thing so much easier esp with the love and support happy New Years to you and yours may god bless you in a bundance<U+2728>"
Most negative comment:
# Text of the comment(s) with the lowest net sentiment score.
pull(filter(comment_word_sentiments, sentiment == min(sentiment)), comment_txt)
## [1] "When I say all of us I mean yes I was an addict and still am I have an addictive personality thingy, (so I<U+0092>m not living in active addiction anymore) i wish I could like sacrifice myself for all the addicts hurting abused children, & the animals ever poor baby that<U+0092>s getting abused or is sad or anything but sadly I<U+0092>m nobody and I can<U+0092>t change any of it but maybe if I think like this maybe I<U+0092>m not the only one and maybe all of us who feel this way can get together and maybe help all those who are"
Interestingly, cannot find the text for the comment with the lowest/highest sentiment in a specific like_category - something to look into in the future.
Exporting a text file of the comments (commented out for now):
#just_comments = webtoons_data %>%
#filter(username != "TESTED @YGetIt on IG") %>%
#select(comment_txt)
#write.table(just_comments, file = "just_comments.txt", sep = ",", quote = TRUE, row.names = F)