azureorange.xyz


Sherlock Holmes

This week's TidyTuesday data covers the complete line-by-line texts of the Sherlock Holmes stories and novels. The dataset includes the full text of each work, organized by book and line number.

As this is my first TidyTuesday, I wanted to try something simple. Therefore, I counted the most common words across all texts (after removing common stop words) and plotted them in a word cloud.

To make things a little more interesting, I shaped the word cloud in the form of Sherlock’s head, which I took from a book cover of The Hound of the Baskervilles.

Word n
holmes 2403
time 879
sir 846
watson 809
house 773
night 718
door 687
hand 649
found 570
eyes 553

Sherlock Holmes-shaped word cloud representing the most common words across all stories

#!/bin/Rscript

#######################
##     LIBRARIES     ##
#######################

# data processing
library(tidyverse)
library(tidytext)

# word cloud
library(ggwordcloud)
library(ggtext)
library(svgtools)

# output
library(patchwork)

##################
##     DATA     ##
##################

# Download the TidyTuesday Sherlock Holmes corpus; the `text` column holds the
# lines of the books.
holmes <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-11-18/holmes.csv"
)

# One word per row; rows that did not yield a word are dropped.
tokens <- holmes %>%
  unnest_tokens(word, text) %>%
  filter(!is.na(word))

# Remove the stop words shipped with tidytext.
tokens_clean <- tokens %>%
  anti_join(stop_words, by = "word")

# Seed the RNG *before* the random orientations are drawn. The script's other
# set.seed(123) call sits in the PLOTS section, i.e. after this sample() call,
# so without this line the angles changed on every run.
set.seed(123)

# The 150 most frequent words, each randomly assigned an angle of 0 or 90
# degrees (roughly 60 % horizontal, 40 % vertical).
word_count <- tokens_clean %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 150) %>%
  mutate(
    angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(60, 40))
  )

####################
##     COLORS     ##
####################

# Page background used by the PDF theme.
bg <- "darkslategray"

# Foreground colours: fg_0 is the light tone (title, most frequent words),
# fg_1 the dark tone (subtitle, caption, least frequent words).
fg_1 <- "bisque4"
fg_0 <- "bisque1"

#########################
##     PLOT PARAMS     ##
#########################

# Theme for the individual panels: nothing but the drawn content, no margin,
# and transparent backgrounds so panels can be layered over each other.
plot_theme <- theme_void() +
  theme(
    plot.margin = unit(rep(0, 4), "cm"),
    panel.background = element_rect(fill = "transparent"),
    plot.background = element_rect(fill = "transparent", colour = NA)
  )

# Theme for the assembled page: 1 cm margin on every side, filled with `bg`.
pdf_theme <- theme_void() +
  theme(
    plot.margin = unit(rep(1, 4), "cm"),
    panel.background = element_rect(fill = bg, colour = bg),
    plot.background = element_rect(fill = bg, colour = bg)
  )

# Silhouette assets, resolved relative to the working directory.
sherlock.png <- "assets/sherlock.png"
sherlock.svg <- "assets/sherlock.svg"

# The PNG is the mask handed to the word cloud; the SVG is the drawing that is
# placed behind the cloud.
mask <- png::readPNG(sherlock.png)
image <- svgparser::read_svg(sherlock.svg, xoffset = 0, yoffset = 0)

###################
##     PLOTS     ##
###################

# Fix the RNG state for everything that follows in this section.
set.seed(123)

# Dummy 10-point data set; it only gives the otherwise empty plots something
# to map onto x and y.
df <- data.frame(x = seq_len(10), y = seq_len(10))

# Title panel: an empty plot that carries just the title and subtitle.
header <- ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_blank() +
  labs(
    title = "Sherlock Holmes",
    subtitle = "Most common words across all books"
  ) +
  plot_theme +
  theme(
    plot.title = element_text(colour = fg_0, size = 69, face = "bold"),
    plot.subtitle = element_text(colour = fg_1, size = 42, face = "bold")
  )

# Word cloud: size and colour of a word both encode its count `n`, `angle`
# carries the orientation stored in word_count. Words are placed using the
# silhouette mask; rm_outside = TRUE is passed so that words which cannot be
# placed are dropped.
cloud <- ggplot(
  data = word_count,
  mapping = aes(label = word, size = n, colour = n, angle = angle)
) +
  geom_text_wordcloud_area(mask = mask, rm_outside = TRUE) +
  scale_size_area(max_size = 69, trans = power_trans(1 / .7)) +
  scale_colour_gradient(low = fg_1, high = fg_0) +
  labs(caption = "#TidyTuesday") +
  plot_theme +
  theme(
    plot.caption = element_markdown(
      colour = fg_1,
      size = 25,
      face = "bold",
      lineheight = 1.2
    )
  )

# Backdrop: an empty plot whose whole panel is covered by the parsed SVG.
# annotation_custom() spans -Inf..Inf on both axes by default, so no explicit
# bounds are needed to fill the panel.
bg_img <- ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_blank() +
  annotation_custom(grob = image) +
  plot_theme

###########################
##     OUTPUT PARAMS     ##
###########################

# Extra rows reserved for the header; also added to the PDF height.
text_height <- 1
th <- text_height

# Offset of the background silhouette relative to the word cloud panel
# (0.12 was an earlier value for bg_x).
bg_x <- 0.11
bg_y <- 0.005

# Patchwork design on a 6-column grid: the header occupies rows 1-2, the word
# cloud the six rows starting at row 1 + th. The third area repeats the second.
layout <- c(
  area(t = 1, l = 1, b = 2, r = 6),
  area(t = th + 1, l = 1, b = th + 6, r = 6),
  area(t = th + 1, l = 1, b = th + 6, r = 6)
)

####################
##     OUTPUT     ##
####################

# Echo the frequency table. print() is explicit so the table also shows up
# when the script is run through source(), where top-level values are not
# auto-printed.
print(word_count)

# Assemble the page: the title stacked above the word cloud, with the
# silhouette inserted beneath the cloud (on_top = FALSE) and shifted by
# bg_x / bg_y. Layout and annotation are attached with `+`, the documented way
# to add them to a patchwork.
final_plot <- header /
  (
    cloud +
      inset_element(
        bg_img,
        left = bg_x, bottom = bg_y, right = 1 + bg_x, top = 1 + bg_y,
        on_top = FALSE
      )
  ) +
  plot_layout(design = layout, widths = 6, heights = c(th, 6, 6)) +
  plot_annotation(theme = pdf_theme)

pdf("sherlock-cloud.pdf", height = 12 + (2 * th), width = 12)

# Render explicitly instead of relying on auto-printing, and close the device
# even if rendering fails so no half-written PDF device is left open.
# invisible() keeps the returned plot from being auto-printed a second time
# after the device has been closed.
invisible(
  tryCatch(
    print(final_plot),
    finally = dev.off()
  )
)