Example of data wrangling in R (part 1)

Load packages

library(readr)
library(dplyr)
library(ggplot2)

Load dataset

# Read the tag-by-year counts from the CSV in the working directory.
by_tag_year <- read_csv(file = "by_tag_year.csv")

Inspect dataset

# Preview the first six rows (the default for head(), spelled out here).
head(by_tag_year, n = 6)
## # A tibble: 6 x 4
##    year tag           number year_total
##   <dbl> <chr>          <dbl>      <dbl>
## 1  2008 .htaccess         54      58390
## 2  2008 .net            5910      58390
## 3  2008 .net-2.0         289      58390
## 4  2008 .net-3.5         319      58390
## 5  2008 .net-4.0           6      58390
## 6  2008 .net-assembly      3      58390

Add fraction column

# fraction = each row's `number` divided by that row's `year_total`.
by_tag_year_fraction <- mutate(
  by_tag_year,
  fraction = number / year_total
)

Filter for R tags

# Keep only the rows whose tag is exactly "r".
r_over_time <- filter(by_tag_year_fraction, tag == "r")

Create a line plot of fraction over time

# Line plot: year on the x-axis, fraction on the y-axis.
ggplot(
  data = r_over_time,
  mapping = aes(x = year, y = fraction)
) +
  geom_line()

A vector of selected tags

# Tags to compare in the next filter step.
selected_tags <- c(
  "r",
  "dplyr",
  "ggplot2"
)

Filter for those tags

# Keep the rows whose tag appears in `selected_tags`.
selected_tags_over_time <- filter(
  by_tag_year_fraction,
  tag %in% selected_tags
)

Plot tags over time on a line plot using color to represent tag

# One line per tag, distinguished by color.
ggplot(
  data = selected_tags_over_time,
  mapping = aes(x = year, y = fraction, color = tag)
) +
  geom_line()

Find total number of questions for each tag

# Sum `number` within each tag, then order tags from largest total to smallest.
sorted_tags <- arrange(
  summarise(
    group_by(by_tag_year, tag),
    tag_total = sum(number)
  ),
  desc(tag_total)
)
## `summarise()` ungrouping output (override with `.groups` argument)

Get tags of interest

# Tags of interest for the final comparison plot.
my_tags <- c(
  "android",
  "ios",
  "windows-phone"
)

Filter for those tags

# Keep the rows whose tag appears in `my_tags`.
by_tag_subset <- filter(by_tag_year_fraction, tag %in% my_tags)

Plot tags over time on a line plot using color to represent tag

# One line per tag in the subset, distinguished by color.
ggplot(
  data = by_tag_subset,
  mapping = aes(x = year, y = fraction, color = tag)
) +
  geom_line()

Amy Jones
Amy Jones

My research interests include autism, psychosis and alexithymia.

Related