Sherlock Holmes
This week's TidyTuesday data explores the complete line-by-line texts of the Sherlock Holmes stories and novels. The dataset includes the full text of each work, organized by book and line number.
As this is my first TidyTuesday, I wanted to try something simple. Therefore, I counted the most common words across all texts and plotted them as a word cloud.
To make things a little more interesting, I shaped the word cloud in the form of Sherlock’s head, which I took from a book cover of The Hound of the Baskervilles.
| Word | n |
|---|---|
| holmes | 2403 |
| time | 879 |
| sir | 846 |
| watson | 809 |
| house | 773 |
| night | 718 |
| door | 687 |
| hand | 649 |
| found | 570 |
| eyes | 553 |

#!/bin/Rscript
#######################
## LIBRARIES ##
#######################
# data processing
library(tidyverse)
library(tidytext)
# word cloud
library(ggwordcloud)
library(ggtext)
library(svgtools)
# output
library(patchwork)
##################
## DATA ##
##################
# Download the line-by-line Sherlock Holmes corpus published for the
# 2025-11-18 TidyTuesday (needs network access).
holmes <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-11-18/holmes.csv"
)

# Split the `text` column into one word per row and drop rows without a word.
tokens <- holmes %>%
  unnest_tokens(word, text) %>%
  filter(!is.na(word))

# Remove every word that is listed in the `stop_words` table.
tokens_clean <- tokens %>%
  anti_join(stop_words, by = "word")

# Seed the RNG *before* sampling the word orientations. Without this the
# angles are drawn from an unseeded generator and change on every run.
set.seed(123)

# Keep the 150 most frequent words and give each a random orientation:
# 0 degrees with weight 60, 90 degrees with weight 40.
word_count <- tokens_clean %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 150) %>%
  mutate(
    angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(60, 40))
  )
####################
## COLORS ##
####################
# Palette: one background colour and two foreground shades.
bg <- "darkslategray" # page background of the final PDF
fg_1 <- "bisque4"     # subtitle, caption and low end of the word gradient
fg_0 <- "bisque1"     # title and high end of the word gradient
#########################
## PLOT PARAMS ##
#########################
# Asset locations, relative to the working directory.
sherlock.png <- "assets/sherlock.png"
sherlock.svg <- "assets/sherlock.svg"

# The PNG is passed to the word cloud as its mask; the SVG is read as a
# graphic that is drawn behind the cloud.
mask <- png::readPNG(sherlock.png)
image <- svgparser::read_svg(sherlock.svg, xoffset = 0, yoffset = 0)

# Theme shared by the individual panels: void theme, transparent backgrounds
# and no outer margin.
plot_theme <- theme_void() +
  theme(
    panel.background = element_rect(fill = "transparent"),
    plot.background = element_rect(fill = "transparent", colour = NA),
    plot.margin = unit(rep(0, 4), "cm")
  )

# Theme for the assembled page: solid `bg` fill with a 1 cm margin all round.
pdf_theme <- theme_void() +
  theme(
    panel.background = element_rect(fill = bg, colour = bg),
    plot.background = element_rect(fill = bg, colour = bg),
    plot.margin = unit(rep(1, 4), "cm")
  )
###################
## PLOTS ##
###################
# Seed the RNG before any plot is built or rendered.
set.seed(123)

# Placeholder data for the panels that draw no geometry of their own.
df <- data.frame(x = seq_len(10), y = seq_len(10))

# Title panel: an empty plot that only carries the title and subtitle.
header <- ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_blank() +
  ggtitle(
    "Sherlock Holmes",
    subtitle = "Most common words across all books"
  ) +
  plot_theme +
  theme(
    plot.subtitle = element_text(colour = fg_1, size = 42, face = "bold"),
    plot.title = element_text(colour = fg_0, size = 69, face = "bold")
  )
# Word cloud: size and colour are both mapped to the word count `n`, and
# `angle` carries the per-word rotation. The layout is restricted to `mask`;
# rm_outside = TRUE is passed so that words which cannot be placed are removed.
cloud <- ggplot(
  data = word_count,
  mapping = aes(label = word, size = n, colour = n, angle = angle)
) +
  geom_text_wordcloud_area(mask = mask, rm_outside = TRUE) +
  scale_size_area(max_size = 69, trans = power_trans(1 / 0.7)) +
  scale_colour_gradient(low = fg_1, high = fg_0) +
  labs(caption = "#TidyTuesday") +
  plot_theme +
  theme(
    plot.caption = element_markdown(
      color = fg_1,
      size = 25,
      face = "bold",
      lineheight = 1.2
    )
  )
# Background panel: an empty plot with the SVG graphic stretched over the
# whole plotting area (annotation_custom() defaults to -Inf/Inf bounds).
bg_img <- ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_blank() +
  annotation_custom(grob = image) +
  plot_theme
###########################
## OUTPUT PARAMS ##
###########################
# Number of layout rows by which the cloud is pushed down below the title.
text_height <- 1
th <- text_height

# Horizontal / vertical shift applied to the background image when it is
# inset behind the cloud (0.12 was tried for bg_x before settling on 0.11).
bg_x <- 0.11
bg_y <- 0.005

# Patchwork design on a 6-column grid: a title area in rows 1-2, followed by
# the same cloud area listed twice, starting `th` rows further down.
cloud_area <- area(t = 1 + th, l = 1, b = 6 + th, r = 6)
layout <- c(
  area(t = 1, l = 1, b = 2, r = 6),
  cloud_area,
  cloud_area
)
####################
## OUTPUT ##
####################
# Show the word counts on the console. Printed explicitly so the table also
# appears when the script is run through source(), where top-level values are
# not auto-printed.
print(word_count)

# Put the background image underneath the word cloud (on_top = FALSE),
# shifted by bg_x / bg_y relative to the cloud panel.
cloud_with_bg <- cloud +
  inset_element(
    bg_img,
    left = bg_x, bottom = bg_y, right = 1 + bg_x, top = 1 + bg_y,
    on_top = FALSE
  )

# Stack the title above the cloud. The layout and page theme are attached
# with `+`, the documented way to add plot_layout() / plot_annotation().
# The plot is assembled before the device is opened, so an error while
# building it does not leave an open device or an empty PDF behind.
poster <- header / cloud_with_bg +
  plot_layout(design = layout, widths = 6, heights = c(th, 6, 6)) +
  plot_annotation(theme = pdf_theme)

# Render to PDF. A plot object is only drawn when it is printed; relying on
# auto-printing works under Rscript but yields an empty file under source(),
# hence the explicit print().
pdf("sherlock-cloud.pdf", height = 12 + (2 * th), width = 12)
print(poster)
dev.off()