Sherlock Holmes

2025-11-18

This week's TidyTuesday data covers the complete line-by-line texts of the Sherlock Holmes stories and novels. The dataset includes the full text of each work, organized by book and line number.

As this is my first TidyTuesday, I wanted to try something simple. Therefore, I counted the most common words across all texts (after removing stop words) and plotted them as a word cloud.

To make things a little more interesting, I shaped the word cloud in the form of Sherlock’s head, which I took from a book cover of The Hound of the Baskervilles.

Word	n
holmes	2403
time	879
sir	846
watson	809
house	773
night	718
door	687
hand	649
found	570
eyes	553

Sherlock Holmes-shaped word cloud showing the most common words across all stories

#!/bin/Rscript

#######################
##     LIBRARIES     ##
#######################

# data processing
library(tidyverse)
library(tidytext)

# word cloud
library(ggwordcloud)
library(ggtext)
library(svgtools)

# output
library(patchwork)

##################
##     DATA     ##
##################

# Download the TidyTuesday Sherlock Holmes corpus (2025-11-18 release).
holmes <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-11-18/holmes.csv')

# Split the `text` column into one token per row (column `word`) and drop
# rows whose token is NA.
tokens <- holmes %>%
    unnest_tokens(word, text) %>%
    filter(!is.na(word))

# Remove every token that appears in the `stop_words` table.
tokens_clean <- tokens %>%
    anti_join(stop_words, by = "word")

# Seed the RNG *before* the orientations are sampled. The script's other
# set.seed() call sits in the PLOTS section, i.e. after this sample() has
# already run, so without this line the angles differ on every run.
set.seed(123)

# Keep the 150 most frequent words and give each one a random orientation:
# 0 degrees (weight 60) or 90 degrees (weight 40).
word_count <- tokens_clean %>%
    count(word, sort = TRUE) %>%
    slice_head(n = 150) %>%
    mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(60, 40)))

####################
##     COLORS     ##
####################

# Palette shared by all plots in this script.
fg_0 <- "bisque1"        # title text and the high end of the word gradient
fg_1 <- "bisque4"        # subtitle, caption and the low end of the gradient
bg   <- "darkslategray"  # page background used by `pdf_theme`

#########################
##     PLOT PARAMS     ##
#########################

# Paths to the Sherlock silhouette, stored once as PNG and once as SVG.
sherlock.png <- "assets/sherlock.png"
sherlock.svg <- "assets/sherlock.svg"

# The PNG is handed to the word cloud geom as its mask; the parsed SVG is
# drawn later via annotation_custom().
# NOTE: `png` and `svgparser` are called through `::` and are not attached by
# the library() calls at the top of the script, so both must be installed.
mask <- png::readPNG(sherlock.png)
image <- svgparser::read_svg(sherlock.svg, xoffset = 0, yoffset = 0)

# Theme for the individual panels: void theme, zero margins and transparent
# panel/plot backgrounds.
plot_theme <- theme_void() +
    theme(
        plot.margin = unit(rep(0, 4), "cm"),
        panel.background = element_rect(fill = "transparent"),
        plot.background = element_rect(fill = "transparent", color = NA)
    )

# Theme for the assembled page: 1 cm margin on every side and a solid `bg`
# fill (with a matching outline) behind both panel and plot.
page_fill <- element_rect(fill = bg, color = bg)
pdf_theme <- theme_void() +
    theme(
        plot.margin = unit(rep(1, 4), "cm"),
        panel.background = page_fill,
        plot.background = page_fill
    )

###################
##     PLOTS     ##
###################

# Fix the RNG state before any plot is built.
set.seed(123)

# Placeholder data: gives the text-only and image-only panels a coordinate
# system without drawing anything (geom_blank).
df <- data.frame(x = 1:10, y = 1:10)
blank_canvas <- ggplot(df, aes(x = x, y = y)) +
    geom_blank()

# Header panel: title and subtitle only.
title_style <- element_text(color = fg_0, size = 69, face = "bold")
subtitle_style <- element_text(color = fg_1, size = 42, face = "bold")

header <- blank_canvas +
    labs(
        title = "Sherlock Holmes",
        subtitle = "Most common words across all books"
    ) +
    plot_theme +
    theme(plot.title = title_style, plot.subtitle = subtitle_style)

# Word cloud: the word count drives both text size and colour, `angle` sets the
# orientation, and `mask` is the silhouette image passed to the geom.
# rm_outside = TRUE is ggwordcloud's switch for dropping words it cannot place.
caption_style <- element_markdown(
    color = fg_1, size = 25, face = "bold", lineheight = 1.2
)

cloud <- ggplot(
    word_count,
    aes(label = word, size = n, color = n, angle = angle)
) +
    geom_text_wordcloud_area(mask = mask, rm_outside = TRUE) +
    scale_size_area(max_size = 69, trans = power_trans(1 / 0.7)) +
    scale_color_gradient(low = fg_1, high = fg_0) +
    labs(caption = "#TidyTuesday") +
    plot_theme +
    theme(plot.caption = caption_style)

# Background panel: the parsed SVG stretched over the whole panel area.
bg_img <- blank_canvas +
    annotation_custom(image, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf) +
    plot_theme

###########################
##     OUTPUT PARAMS     ##
###########################

# Height assigned to the header row; `th` is the short alias used below and in
# the OUTPUT section.
text_height <- 1
th <- text_height

# Offsets applied to the inset that holds the background image.
# Alternative value kept from an earlier version: bg_x = 0.12
bg_x <- 0.11
bg_y <- 0.005

# Patchwork design with three areas, all spanning columns 1-6:
#   1. rows 1-2
#   2. rows (1 + th)-(6 + th)
#   3. the same bounds as area 2
body_top <- 1 + th
body_bottom <- 6 + th

layout <- c(
    area(t = 1, l = 1, b = 2, r = 6),
    area(t = body_top, l = 1, b = body_bottom, r = 6),
    area(t = body_top, l = 1, b = body_bottom, r = 6)
)

####################
##     OUTPUT     ##
####################

# Show the word counts. An explicit print() is used because top-level
# auto-printing only happens under Rscript or the console, not under source().
print(word_count)

# Open the PDF device; the page grows with the header height `th`.
pdf("sherlock-cloud.pdf", height = 12 + ( 2 * th ), width = 12)

# Stack the header above the word cloud, with the background image inset
# beneath the cloud (on_top = FALSE) and shifted by bg_x / bg_y.
# `/` binds tighter than `+`, so plot_annotation() is added to the whole stack.
final_plot <- header /
    ( cloud + inset_element(bg_img,
                            left = bg_x, bottom = bg_y, right = 1 + bg_x, top = 1 + bg_y,
                            on_top = FALSE)
    ) /
    plot_layout(design = layout, widths = 6, heights = c(th, 6, 6)) +
    plot_annotation(theme = pdf_theme)

# Render explicitly: a bare plot expression draws nothing when the script is
# run through source(), which would leave an empty PDF.
print(final_plot)

dev.off()