Joining data with dplyr

By Salerno | October 8, 2022

1) Introduction

What about knowing more about some important concepts around manipulating data sets? We definitely know how it is tough working in the real problems world, a lot of information sprawled in many databases.

With the knowledge shared below you’ll probably find a solution that fits in your day-a-day as data scientists.

Enjoy my folks!

2) Datasets


parts <- readRDS('C://Users//andre//OneDrive//Documentos//Private//Salerno//Pessoal//Cursos//datacamp//dplyr//parts.rds')

parts_categories <- readRDS('C://Users//andre//OneDrive//Documentos//Private//Salerno//Pessoal//Cursos//datacamp//dplyr//part_categories.rds')

inventories <- readRDS('C://Users//andre//OneDrive//Documentos//Private//Salerno//Pessoal//Cursos//datacamp//dplyr//inventories.rds')

inventory_parts <- readRDS('C://Users//andre//OneDrive//Documentos//Private//Salerno//Pessoal//Cursos//datacamp//dplyr//inventory_parts.rds')

sets <- readRDS('C://Users//andre//OneDrive//Documentos//Private//Salerno//Pessoal//Cursos//datacamp//dplyr//sets.rds')

colors <- readRDS('C://Users//andre//OneDrive//Documentos//Private//Salerno//Pessoal//Cursos//datacamp//dplyr//colors.rds')

themes <- readRDS('C://Users//andre//OneDrive//Documentos//Private//Salerno//Pessoal//Cursos//datacamp//dplyr//themes.rds')

color_palette <- readxl::read_xlsx("C://Users//andre//OneDrive//Documentos//Private//Salerno//Pessoal//Cursos//datacamp//dplyr//color_palette.xlsx")

questions <- readRDS("C://Users//andre//OneDrive//Documentos//Private//Salerno//Pessoal//Cursos//datacamp//dplyr//questions.rds")

question_tags <- readRDS("C://Users//andre//OneDrive//Documentos//Private//Salerno//Pessoal//Cursos//datacamp//dplyr//question_tags.rds")

tags <- readRDS("C://Users//andre//OneDrive//Documentos//Private//Salerno//Pessoal//Cursos//datacamp//dplyr//tags.rds")

answers <- readRDS("C://Users//andre//OneDrive//Documentos//Private//Salerno//Pessoal//Cursos//datacamp//dplyr//answers.rds")

3) Knowing dimensionals


dim(parts)
## [1] 17501     3
colnames(parts)
## [1] "part_num"    "name"        "part_cat_id"

dim(parts_categories)
## [1] 64  2
colnames(parts_categories)
## [1] "id"   "name"

dim(inventories)
## [1] 15174     3
colnames(inventories)
## [1] "id"      "version" "set_num"

dim(inventory_parts)
## [1] 258958      4
colnames(inventory_parts)
## [1] "inventory_id" "part_num"     "color_id"     "quantity"

dim(sets)
## [1] 4977    4
colnames(sets)
## [1] "set_num"  "name"     "year"     "theme_id"

dim(colors)
## [1] 179   3
colnames(colors)
## [1] "id"   "name" "rgb"

4) Joining parts and part categories

So, in this stage we’ve decided to create a new object called parts_join as a result of the use of parts dataset and parts_categories, unified by columns named part_cat_id (first dataset) and id (second dataset) respectively.

As mention above, before joining we have 3 columns by the parts data set, and parts_catogories has 2 columns.

As a result of this joining process, we’ll have 3 columns that were unified by the attributes part_cat_id from the parts and id from the parts_categories dataset.


parts %>% 
  count(part_cat_id, sort = TRUE)
## # A tibble: 63 × 2
##    part_cat_id     n
##          <dbl> <int>
##  1          60  2091
##  2           4  1900
##  3          59  1565
##  4          27   937
##  5          61   805
##  6          19   804
##  7          65   803
##  8          41   701
##  9          28   529
## 10          58   518
## # … with 53 more rows

parts_categories %>% 
  count(id, sort = TRUE)
## # A tibble: 64 × 2
##       id     n
##    <dbl> <int>
##  1     1     1
##  2     3     1
##  3     4     1
##  4     5     1
##  5     6     1
##  6     7     1
##  7     8     1
##  8     9     1
##  9    11     1
## 10    12     1
## # … with 54 more rows

There is an important verb called inner join. Check out the image below:

Inner Join. Source: DataCamp We obtain a new table where the attributes in the first table is incremented with the columns with the second table.


parts_join <- parts %>% 
      inner_join(parts_categories, by = c("part_cat_id" = "id"), suffix = c("_part", "_category"))

head(parts_join)
## # A tibble: 6 × 4
##   part_num name_part                                   part_cat_id name_category
##   <chr>    <chr>                                             <dbl> <chr>        
## 1 0901     Baseplate 16 x 30 with Set 080 Yellow Hous…           1 Baseplates   
## 2 0902     Baseplate 16 x 24 with Set 080 Small White…           1 Baseplates   
## 3 0903     Baseplate 16 x 24 with Set 080 Red House P…           1 Baseplates   
## 4 0904     Baseplate 16 x 24 with Set 080 Large White…           1 Baseplates   
## 5 1        Homemaker Bookcase 2 x 4 x 4                          7 Containers   
## 6 10016414 Sticker Sheet #1 for 41055-1                         58 Stickers

colnames(parts_join)
## [1] "part_num"      "name_part"     "part_cat_id"   "name_category"

colnames(parts)
## [1] "part_num"    "name"        "part_cat_id"
parts %>% 
  count(part_num)
## # A tibble: 17,501 × 2
##    part_num       n
##    <chr>      <int>
##  1 0901           1
##  2 0902           1
##  3 0903           1
##  4 0904           1
##  5 1              1
##  6 10016414       1
##  7 10026stk01     1
##  8 10039          1
##  9 10048          1
## 10 10049          1
## # … with 17,491 more rows

colnames(inventory_parts)
## [1] "inventory_id" "part_num"     "color_id"     "quantity"
inventory_parts %>% 
  count(part_num)
## # A tibble: 17,501 × 2
##    part_num       n
##    <chr>      <int>
##  1 0901           1
##  2 0902           1
##  3 0903           1
##  4 0904           1
##  5 1              7
##  6 10016414       1
##  7 10026stk01     1
##  8 10039          4
##  9 10048          3
## 10 10049          2
## # … with 17,491 more rows

parts %>%
  inner_join(inventory_parts, by = c("part_num" = "part_num"))
## # A tibble: 258,958 × 6
##    part_num name                      part_cat_id inventory_id color_id quantity
##    <chr>    <chr>                           <dbl>        <dbl>    <dbl>    <dbl>
##  1 0901     Baseplate 16 x 30 with S…           1         1973        2        1
##  2 0902     Baseplate 16 x 24 with S…           1         1973        2        1
##  3 0903     Baseplate 16 x 24 with S…           1         1973        2        1
##  4 0904     Baseplate 16 x 24 with S…           1         1973        2        1
##  5 1        Homemaker Bookcase 2 x 4…           7          508       15        1
##  6 1        Homemaker Bookcase 2 x 4…           7         1158       15        2
##  7 1        Homemaker Bookcase 2 x 4…           7         6590       15        2
##  8 1        Homemaker Bookcase 2 x 4…           7         9679       15        2
##  9 1        Homemaker Bookcase 2 x 4…           7        12256        1        2
## 10 1        Homemaker Bookcase 2 x 4…           7        13356       15        1
## # … with 258,948 more rows

5) Joining two tables


sets %>%
  # Add inventories using an inner join 
  inner_join(inventories, by = c('set_num' = 'set_num' )) %>%
  # Add inventory_parts using an inner join 
  inner_join(inventory_parts, by = c('id' = 'inventory_id'))
## # A tibble: 258,958 × 9
##    set_num name           year theme_id    id version part_num color_id quantity
##    <chr>   <chr>         <dbl>    <dbl> <dbl>   <dbl> <chr>       <dbl>    <dbl>
##  1 700.3-1 Medium Gift …  1949      365 24197       1 bdoor01         2        2
##  2 700.3-1 Medium Gift …  1949      365 24197       1 bdoor01        15        1
##  3 700.3-1 Medium Gift …  1949      365 24197       1 bdoor01         4        1
##  4 700.3-1 Medium Gift …  1949      365 24197       1 bslot02        15        6
##  5 700.3-1 Medium Gift …  1949      365 24197       1 bslot02         2        6
##  6 700.3-1 Medium Gift …  1949      365 24197       1 bslot02         4        6
##  7 700.3-1 Medium Gift …  1949      365 24197       1 bslot02         1        6
##  8 700.3-1 Medium Gift …  1949      365 24197       1 bslot02        14        6
##  9 700.3-1 Medium Gift …  1949      365 24197       1 bslot02a       15        6
## 10 700.3-1 Medium Gift …  1949      365 24197       1 bslot02a        2        6
## # … with 258,948 more rows
colnames(sets)
## [1] "set_num"  "name"     "year"     "theme_id"
colnames(inventories)
## [1] "id"      "version" "set_num"
colnames(inventory_parts)
## [1] "inventory_id" "part_num"     "color_id"     "quantity"
colnames(colors)
## [1] "id"   "name" "rgb"

6) Joining three tables


# Add an inner join for the colors table
sets %>%
  inner_join(inventories, by = "set_num") %>%
  inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
  inner_join(colors, by = c("color_id" = "id"), suffix = c("_set", "_color"))
## # A tibble: 258,958 × 11
##    set_num name_set       year theme_id    id version part_num color_id quantity
##    <chr>   <chr>         <dbl>    <dbl> <dbl>   <dbl> <chr>       <dbl>    <dbl>
##  1 700.3-1 Medium Gift …  1949      365 24197       1 bdoor01         2        2
##  2 700.3-1 Medium Gift …  1949      365 24197       1 bdoor01        15        1
##  3 700.3-1 Medium Gift …  1949      365 24197       1 bdoor01         4        1
##  4 700.3-1 Medium Gift …  1949      365 24197       1 bslot02        15        6
##  5 700.3-1 Medium Gift …  1949      365 24197       1 bslot02         2        6
##  6 700.3-1 Medium Gift …  1949      365 24197       1 bslot02         4        6
##  7 700.3-1 Medium Gift …  1949      365 24197       1 bslot02         1        6
##  8 700.3-1 Medium Gift …  1949      365 24197       1 bslot02        14        6
##  9 700.3-1 Medium Gift …  1949      365 24197       1 bslot02a       15        6
## 10 700.3-1 Medium Gift …  1949      365 24197       1 bslot02a        2        6
## # … with 258,948 more rows, and 2 more variables: name_color <chr>, rgb <chr>

# Count the number of colors and sort
sets %>%
  inner_join(inventories, by = "set_num") %>%
  inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
  inner_join(colors, by = c("color_id" = "id"), suffix = c("_set", "_color")) %>%
  count(name_color, sort = TRUE)
## # A tibble: 134 × 2
##    name_color            n
##    <chr>             <int>
##  1 Black             48068
##  2 White             30105
##  3 Light Bluish Gray 26024
##  4 Red               21602
##  5 Dark Bluish Gray  19948
##  6 Yellow            17088
##  7 Blue              12980
##  8 Light Gray         8632
##  9 Reddish Brown      6960
## 10 Tan                6664
## # … with 124 more rows

inventory_parts_joined <- inventories %>% 
  inner_join(inventory_parts, by = c("id" = "inventory_id")) %>% 
  select(-id, -version) %>% 
  arrange(desc(quantity))

inventory_parts_joined
## # A tibble: 258,958 × 4
##    set_num  part_num color_id quantity
##    <chr>    <chr>       <dbl>    <dbl>
##  1 40179-1  3024           72      900
##  2 40179-1  3024           15      900
##  3 40179-1  3024            0      900
##  4 40179-1  3024           71      900
##  5 40179-1  3024           14      900
##  6 k34434-1 3024           15      810
##  7 21010-1  3023          320      771
##  8 k34431-1 3024            0      720
##  9 42083-1  2780            0      684
## 10 k34434-1 3024            0      540
## # … with 258,948 more rows

7) Left joining two sets by part and color

According with the image below, the left join function maintain all the rows in the table on the left, while increment some attributes matched in the second table.

Left Join. Source: DataCamp


batmobile <- inventory_parts_joined %>% 
  filter(set_num == "7784-1") %>% 
  select(-set_num)

batwing <- inventory_parts_joined %>% 
  filter(set_num == "70916-1") %>% 
  select(-set_num)

batmobile %>% 
  inner_join(batwing, by = c("part_num", "color_id"), suffix = c("_batmobile", "_batwing"))
## # A tibble: 45 × 4
##    part_num color_id quantity_batmobile quantity_batwing
##    <chr>       <dbl>              <dbl>            <dbl>
##  1 2780            0                 28               17
##  2 50950           0                 28                2
##  3 3004           71                 26                2
##  4 43093           1                 25                6
##  5 3004            0                 23                4
##  6 3622            0                 18                2
##  7 4286            0                 16                1
##  8 3039            0                 12                2
##  9 4274           71                 12                7
## 10 3001            0                 11                4
## # … with 35 more rows

batmobile %>%
  left_join(batwing, by =c("part_num","color_id"), suffix =c("_batmobile","_batwing"))
## # A tibble: 173 × 4
##    part_num color_id quantity_batmobile quantity_batwing
##    <chr>       <dbl>              <dbl>            <dbl>
##  1 3023           72                 62               NA
##  2 2780            0                 28               17
##  3 50950           0                 28                2
##  4 3004           71                 26                2
##  5 43093           1                 25                6
##  6 3004            0                 23                4
##  7 3010            0                 21               NA
##  8 30363           0                 21               NA
##  9 32123b         14                 19               NA
## 10 3622            0                 18                2
## # … with 163 more rows

millennium_falcon <- inventory_parts_joined %>% 
  filter(set_num == "7965-1")

star_destroyer <- inventory_parts_joined %>% 
  filter(set_num == "75190-1")

# Combine the star_destroyer and millennium_falcon tables
millennium_falcon %>%
  left_join(star_destroyer, by = c("part_num", "color_id"), suffix = c("_falcon", "_star_destroyer"))
## # A tibble: 263 × 6
##    set_num_falcon part_num color_id quantity_falcon set_num_star_destroyer
##    <chr>          <chr>       <dbl>           <dbl> <chr>                 
##  1 7965-1         63868          71              62 <NA>                  
##  2 7965-1         3023            0              60 <NA>                  
##  3 7965-1         3021           72              46 75190-1               
##  4 7965-1         2780            0              37 75190-1               
##  5 7965-1         60478          72              36 <NA>                  
##  6 7965-1         6636           71              34 75190-1               
##  7 7965-1         3009           71              28 75190-1               
##  8 7965-1         3665           71              22 <NA>                  
##  9 7965-1         2412b          72              20 75190-1               
## 10 7965-1         3010           71              19 <NA>                  
## # … with 253 more rows, and 1 more variable: quantity_star_destroyer <dbl>

8) Left joining two sets by color


# Aggregate Millennium Falcon for the total quantity in each part
millennium_falcon_colors <- millennium_falcon %>%
  group_by(color_id) %>%
  summarize(total_quantity = sum(quantity))

# Aggregate Star Destroyer for the total quantity in each part
star_destroyer_colors <- star_destroyer %>%
  group_by(color_id) %>%
  summarize(total_quantity = sum(quantity))

# Left join the Millennium Falcon colors to the Star Destroyer colors
millennium_falcon_colors %>%
  left_join(star_destroyer_colors, by = c("color_id"), suffix = c("_falcon", "_star_destroyer"))
## # A tibble: 21 × 3
##    color_id total_quantity_falcon total_quantity_star_destroyer
##       <dbl>                 <dbl>                         <dbl>
##  1        0                   201                           336
##  2        1                    15                            23
##  3        4                    17                            53
##  4       14                     3                             4
##  5       15                    15                            17
##  6       19                    95                            12
##  7       28                     3                            16
##  8       33                     5                            NA
##  9       36                     1                            14
## 10       41                     6                            15
## # … with 11 more rows

inventory_version_1 <- inventories %>%
  filter(version == 1)

# Join versions to sets
sets %>%
  left_join(inventory_version_1, by = "set_num") %>%
  # Filter for where version is na
  filter(is.na(version))
## # A tibble: 1 × 6
##   set_num name       year theme_id    id version
##   <chr>   <chr>     <dbl>    <dbl> <dbl>   <dbl>
## 1 40198-1 Ludo game  2018      598    NA      NA

9) Counting part colors

In this chapter, we introduce the right join verb, which describes the match between two tables, and the result in concentrate on the right table. Check the image out below:

Right Join. Source: DataCamp


parts %>%
  # Count the part_cat_id
  count(part_cat_id) %>%
  # Right join part_categories
  right_join(parts_categories, by = c("part_cat_id" = "id")) %>% 
  filter(is.na(n))
## # A tibble: 1 × 3
##   part_cat_id     n name   
##         <dbl> <int> <chr>  
## 1          66    NA Modulex

parts %>%
  count(part_cat_id) %>%
  right_join(parts_categories, by = c("part_cat_id" = "id")) %>%
  # Use replace_na to replace missing values in the n column
  tidyr::replace_na(list(n = 0))
## # A tibble: 64 × 3
##    part_cat_id     n name                   
##          <dbl> <int> <chr>                  
##  1           1   135 Baseplates             
##  2           3   303 Bricks Sloped          
##  3           4  1900 Duplo, Quatro and Primo
##  4           5   107 Bricks Special         
##  5           6   128 Bricks Wedged          
##  6           7    97 Containers             
##  7           8    24 Technic Bricks         
##  8           9   167 Plates Special         
##  9          11   490 Bricks                 
## 10          12    85 Technic Connectors     
## # … with 54 more rows

10) Joining themes to their children


themes %>% 
  # Inner join the themes table
  inner_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
  # Filter for the "Harry Potter" parent name 
  filter(name_parent == "Harry Potter")
## # A tibble: 6 × 5
##      id name_parent  parent_id id_child name_child          
##   <dbl> <chr>            <dbl>    <dbl> <chr>               
## 1   246 Harry Potter        NA      247 Chamber of Secrets  
## 2   246 Harry Potter        NA      248 Goblet of Fire      
## 3   246 Harry Potter        NA      249 Order of the Phoenix
## 4   246 Harry Potter        NA      250 Prisoner of Azkaban 
## 5   246 Harry Potter        NA      251 Sorcerer's Stone    
## 6   246 Harry Potter        NA      667 Fantastic Beasts

11) Joining themes to their grandchildren


# Join themes to itself again to find the grandchild relationships
themes %>% 
  inner_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
  inner_join(themes, by = c("id_child" = "parent_id"), suffix = c("_parent", "_grandchild"))
## # A tibble: 158 × 7
##    id_parent name_parent parent_id id_child name_child id_grandchild name       
##        <dbl> <chr>           <dbl>    <dbl> <chr>              <dbl> <chr>      
##  1         1 Technic            NA        5 Model                  6 Airport    
##  2         1 Technic            NA        5 Model                  7 Constructi…
##  3         1 Technic            NA        5 Model                  8 Farm       
##  4         1 Technic            NA        5 Model                  9 Fire       
##  5         1 Technic            NA        5 Model                 10 Harbor     
##  6         1 Technic            NA        5 Model                 11 Off-Road   
##  7         1 Technic            NA        5 Model                 12 Race       
##  8         1 Technic            NA        5 Model                 13 Riding Cyc…
##  9         1 Technic            NA        5 Model                 14 Robot      
## 10         1 Technic            NA        5 Model                 15 Traffic    
## # … with 148 more rows

12) Left joining a table to itself


themes %>% 
  # Left join the themes table to its own children
  left_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
  # Filter for themes that have no child themes
  filter(is.na(name_child))
## # A tibble: 586 × 5
##       id name_parent    parent_id id_child name_child
##    <dbl> <chr>              <dbl>    <dbl> <chr>     
##  1     2 Arctic Technic         1       NA <NA>      
##  2     3 Competition            1       NA <NA>      
##  3     4 Expert Builder         1       NA <NA>      
##  4     6 Airport                5       NA <NA>      
##  5     7 Construction           5       NA <NA>      
##  6     8 Farm                   5       NA <NA>      
##  7     9 Fire                   5       NA <NA>      
##  8    10 Harbor                 5       NA <NA>      
##  9    11 Off-Road               5       NA <NA>      
## 10    12 Race                   5       NA <NA>      
## # … with 576 more rows

13) Differences between Batman and Star Wars


dim(inventory_parts_joined)
## [1] 258958      4

# Start with inventory_parts_joined table
inventory_sets_themes <- inventory_parts_joined %>%
      # Combine with the sets table 
      inner_join(sets, by = "set_num") %>%
      # Combine with the themes table
      inner_join(themes, by = c("theme_id" = "id"), suffix = c("_set", "_theme"))

dim(inventory_sets_themes)
## [1] 258958      9

14) Aggregating each theme


batman <- inventory_sets_themes %>%
  filter(name_theme == "Batman")

star_wars <- inventory_sets_themes %>%
  filter(name_theme == "Star Wars")

# Count the part number and color id, weight by quantity
batman_parts <- batman %>%
  count(part_num, color_id, wt = quantity)

star_wars_parts <- star_wars %>%
  count(part_num, color_id, wt = quantity)

15) Full joining Batman and Star Wars LEGO parts

The full join aggregates the rows from the two tables (left and join) even though than does not have any match.

Full Join. Source: DataCamp


parts_joined <- batman_parts %>%
  # Combine the star_wars_parts table 
  full_join(star_wars_parts, by = c("part_num", "color_id"), suffix = c("_batman", "_star_wars")) %>%
  # Replace NAs with 0s in the n_batman and n_star_wars columns 
  tidyr::replace_na(list(n_batman = 0,
                  n_star_wars = 0))

16) Comparing Batman and Star Wars LEGO parts


parts_joined %>%
  # Sort the number of star wars pieces in descending order 
  arrange(desc(n_star_wars)) %>%
  # Join the colors table to the parts_joined table
  inner_join(colors, by = c("color_id" = "id")) %>%
  # Join the parts table to the previous join 
  inner_join(parts, by = "part_num", suffix = c("_color", "_part"))
## # A tibble: 3,628 × 8
##    part_num color_id n_batman n_star_wars name_color rgb   name_part part_cat_id
##    <chr>       <dbl>    <dbl>       <dbl> <chr>      <chr> <chr>           <dbl>
##  1 2780            0      104         392 Black      #051… Technic …          53
##  2 32062           0        1         141 Black      #051… Technic …          46
##  3 4274            1       56         118 Blue       #005… Technic …          53
##  4 6141           36       11         117 Trans-Red  #C91… Plate Ro…          21
##  5 3023           71       10         106 Light Blu… #A0A… Plate 1 …          14
##  6 6558            1       30         106 Blue       #005… Technic …          53
##  7 43093           1       44          99 Blue       #005… Technic …          53
##  8 3022           72       14          95 Dark Blui… #6C6… Plate 2 …          14
##  9 2357           19        0          84 Tan        #E4C… Brick 2 …          11
## 10 6141          179       90          81 Flat Silv… #898… Plate Ro…          21
## # … with 3,618 more rows

17) Something within one set but not another

With the semi join verb it aggregates only when you find match between left and right tables.

Semi Join. Source: DataCamp


# Filter the batwing set for parts that are also in the batmobile set
batwing %>%
  semi_join(batmobile, by = c("part_num"))
## # A tibble: 126 × 3
##    part_num color_id quantity
##    <chr>       <dbl>    <dbl>
##  1 3023            0       22
##  2 3024            0       22
##  3 3623            0       20
##  4 2780            0       17
##  5 3666            0       16
##  6 3710            0       14
##  7 6141            4       12
##  8 2412b          71       10
##  9 6141           72       10
## 10 6558            1        9
## # … with 116 more rows

# Filter the batwing set for parts that aren't in the batmobile set
batwing %>%
  anti_join(batmobile, by = c("part_num"))
## # A tibble: 183 × 3
##    part_num color_id quantity
##    <chr>       <dbl>    <dbl>
##  1 11477           0       18
##  2 99207          71       18
##  3 22385           0       14
##  4 99563           0       13
##  5 10247          72       12
##  6 2877           72       12
##  7 61409          72       12
##  8 11153           0       10
##  9 98138          46       10
## 10 2419           72        9
## # … with 173 more rows

18) What colors are included in at least one set?


# Use inventory_parts to find colors included in at least one set
colors %>%
  semi_join(inventory_parts, by = c("id" = "color_id"))
## # A tibble: 134 × 3
##       id name           rgb    
##    <dbl> <chr>          <chr>  
##  1    -1 [Unknown]      #0033B2
##  2     0 Black          #05131D
##  3     1 Blue           #0055BF
##  4     2 Green          #237841
##  5     3 Dark Turquoise #008F9B
##  6     4 Red            #C91A09
##  7     5 Dark Pink      #C870A0
##  8     6 Brown          #583927
##  9     7 Light Gray     #9BA19D
## 10     8 Dark Gray      #6D6E5C
## # … with 124 more rows

19) Which set is missing version 1?


# Use filter() to extract version 1 
version_1_inventories <- inventories %>%
  filter(version == 1)

# Use anti_join() to find which set is missing a version 1
sets %>%
  anti_join(version_1_inventories, by = c("set_num" = "set_num"))
## # A tibble: 1 × 4
##   set_num name       year theme_id
##   <chr>   <chr>     <dbl>    <dbl>
## 1 40198-1 Ludo game  2018      598

20) Aggregating sets to look at their differences


inventory_parts_themes <- inventories %>%
  inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
  arrange(desc(quantity)) %>%
  select(-id, -version) %>%
  inner_join(sets, by = "set_num") %>%
  inner_join(themes, by = c("theme_id" = "id"), suffix = c("_set", "_theme"))

batman_colors <- inventory_parts_themes %>%
  # Filter the inventory_parts_themes table for the Batman theme
  filter(name_theme == "Batman") %>%
  group_by(color_id) %>%
  summarize(total = sum(quantity)) %>%
  # Add a fraction column of the total divided by the sum of the total 
  mutate(fraction = total / sum(total))

# Filter and aggregate the Star Wars set data; add a fraction column
star_wars_colors <- inventory_parts_themes %>%
  filter(name_theme == "Star Wars") %>%
  group_by(color_id) %>%
  summarize(total = sum(quantity)) %>%
  # Add a fraction column of the total divided by the sum of the total 
  mutate(fraction = total / sum(total))
	

21) Combining sets


batman_colors %>%
  full_join(star_wars_colors, by = "color_id", suffix = c("_batman", "_star_wars")) %>%
  tidyr::replace_na(list(total_batman = 0, total_star_wars = 0)) %>%
  inner_join(colors, by = c("color_id" = "id")) %>%
  # Create the difference and total columns
  mutate(difference = fraction_batman - fraction_star_wars,
         total = total_batman + total_star_wars) %>%
  # Filter for totals greater than 200
  filter(total > 200) %>% 
  arrange(desc(difference))
## # A tibble: 16 × 9
##    color_id total_batman fraction_batman total_star_wars fraction_star_wa… name 
##       <dbl>        <dbl>           <dbl>           <dbl>             <dbl> <chr>
##  1        0         2807         0.296              3258           0.207   Black
##  2       14          426         0.0449              207           0.0132  Yell…
##  3        4          529         0.0558              434           0.0276  Red  
##  4       84          278         0.0293               31           0.00197 Medi…
##  5       46          200         0.0211               39           0.00248 Tran…
##  6       70          297         0.0313              373           0.0237  Redd…
##  7      179          154         0.0162              232           0.0148  Flat…
##  8        1          243         0.0256              410           0.0261  Blue 
##  9       28           98         0.0103              183           0.0116  Dark…
## 10       72         1453         0.153              2433           0.155   Dark…
## 11       36           86         0.00907             246           0.0156  Tran…
## 12      378           22         0.00232             430           0.0273  Sand…
## 13       19          142         0.0150             1012           0.0644  Tan  
## 14       15          404         0.0426             1771           0.113   White
## 15       71         1148         0.121              3264           0.208   Ligh…
## 16        7            0        NA                   209           0.0133  Ligh…
## # … with 3 more variables: rgb <chr>, difference <dbl>, total <dbl>

22) Visualizing the difference: Batman and Star Wars

library(forcats)
## Warning: package 'forcats' was built under R version 4.2.1

colors_joined <- batman_colors %>%
  full_join(star_wars_colors, by = "color_id", suffix = c("_batman", "_star_wars")) %>%
  tidyr::replace_na(list(total_batman = 0, total_star_wars = 0)) %>%
  inner_join(colors, by = c("color_id" = "id")) %>%
  mutate(difference = fraction_batman - fraction_star_wars,
         total = total_batman + total_star_wars) %>%
  filter(total >= 200) %>%
  mutate(name = fct_reorder(name, difference)) 

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.1

#color_palette_fct <- as.vector(color_palette$color_name)

color_palette<- as.vector(color_palette$id)

#color_palette <- factor(color_palette$id, levels = color_palette_fct)

# Create a bar plot using colors_joined and the name and difference columns
ggplot(colors_joined, aes(name, difference, fill = name)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = color_palette, guide = "none") +
  labs(y = "Difference: Batman - Star Wars")
## Warning: Removed 1 rows containing missing values (position_stack).

# Replace the NAs in the tag_name column
questions_with_tags <- questions %>%
  left_join(question_tags, by = c("id" = "question_id")) %>%
  left_join(tags, by = c("tag_id" = "id")) %>%
  tidyr::replace_na(list(tag_name = "only-r"))

24) Comparing scores across tags


questions_with_tags %>% 
  # Group by tag_name
  group_by(tag_name) %>%
  # Get mean score and num_questions
  summarize(score = mean(score),
        	num_questions = n()) %>%
  # Sort num_questions in descending order
  arrange(desc(num_questions))
## # A tibble: 7,841 × 3
##    tag_name   score num_questions
##    <chr>      <dbl>         <int>
##  1 only-r     1.26          48541
##  2 ggplot2    2.61          28228
##  3 dataframe  2.31          18874
##  4 shiny      1.45          14219
##  5 dplyr      1.95          14039
##  6 plot       2.24          11315
##  7 data.table 2.97           8809
##  8 matrix     1.66           6205
##  9 loops      0.743          5149
## 10 regex      2              4912
## # … with 7,831 more rows

25) What tags never appear on R questions?

And if you looking for some events that does not appear in the right table? So, in this case we recommend to use anti join.

Anti Join. Source: DataCamp


# Using a join, filter for tags that are never on an R question
tags %>%
  anti_join(question_tags, by = c("id" = "tag_id"))
## # A tibble: 40,459 × 2
##        id tag_name                 
##     <dbl> <chr>                    
##  1 124399 laravel-dusk             
##  2 124402 spring-cloud-vault-config
##  3 124404 spring-vault             
##  4 124405 apache-bahir             
##  5 124407 astc                     
##  6 124408 simulacrum               
##  7 124410 angulartics2             
##  8 124411 django-rest-viewsets     
##  9 124414 react-native-lightbox    
## 10 124417 java-module              
## # … with 40,449 more rows

26) Finding gaps between questions and answers


questions %>%
  # Inner join questions and answers with proper suffixes
  inner_join(answers, by = c("id" = "question_id"), suffix = c("_question", "_answer")) %>%
  # Subtract creation_date_question from creation_date_answer to create gap
  mutate(gap = as.integer(creation_date_question - creation_date_answer))
## # A tibble: 380,643 × 7
##          id creation_date_question score_question id_answer creation_date_answer
##       <int> <date>                          <int>     <int> <date>              
##  1 22557677 2014-03-21                          1  22560670 2014-03-21          
##  2 22557707 2014-03-21                          2  22558516 2014-03-21          
##  3 22557707 2014-03-21                          2  22558726 2014-03-21          
##  4 22558084 2014-03-21                          2  22558085 2014-03-21          
##  5 22558084 2014-03-21                          2  22606545 2014-03-24          
##  6 22558084 2014-03-21                          2  22610396 2014-03-24          
##  7 22558084 2014-03-21                          2  34374729 2015-12-19          
##  8 22558395 2014-03-21                          2  22559327 2014-03-21          
##  9 22558395 2014-03-21                          2  22560102 2014-03-21          
## 10 22558395 2014-03-21                          2  22560288 2014-03-21          
## # … with 380,633 more rows, and 2 more variables: score_answer <int>, gap <int>

27) Joining question and answer counts


# Count and sort the question id column in the answers table
answer_counts <- answers %>%
  count(question_id, sort = TRUE)

# Combine the answer_counts and questions tables
questions %>%
  left_join(answer_counts, by = c("id" = "question_id")) %>%
  # Replace the NAs in the n column
  tidyr::replace_na(list(n = 0))
## # A tibble: 294,735 × 4
##          id creation_date score     n
##       <int> <date>        <int> <int>
##  1 22557677 2014-03-21        1     1
##  2 22557707 2014-03-21        2     2
##  3 22558084 2014-03-21        2     4
##  4 22558395 2014-03-21        2     3
##  5 22558613 2014-03-21        0     1
##  6 22558677 2014-03-21        2     2
##  7 22558887 2014-03-21        8     1
##  8 22559180 2014-03-21        1     1
##  9 22559312 2014-03-21        0     1
## 10 22559322 2014-03-21        2     5
## # … with 294,725 more rows

28) Joining questions, answers and tags


answer_counts <- answers %>%
    count(question_id, sort = TRUE)

question_answer_counts <- questions %>%
    left_join(answer_counts, by = c("id" = "question_id")) %>%
    tidyr::replace_na(list(n = 0))

question_answer_counts %>%
  # Join the question_tags tables
  inner_join(question_tags, by = c("id" = "question_id")) %>%
  # Join the tags table
  inner_join(tags, by = c("tag_id" = "id"))
## # A tibble: 497,153 × 6
##          id creation_date score     n tag_id tag_name           
##       <int> <date>        <int> <int>  <dbl> <chr>              
##  1 22557677 2014-03-21        1     1     18 regex              
##  2 22557677 2014-03-21        1     1    139 string             
##  3 22557677 2014-03-21        1     1  16088 time-complexity    
##  4 22557677 2014-03-21        1     1   1672 backreference      
##  5 22558084 2014-03-21        2     4   6419 time-series        
##  6 22558084 2014-03-21        2     4  92764 panel-data         
##  7 22558395 2014-03-21        2     3   5569 function           
##  8 22558395 2014-03-21        2     3    134 sorting            
##  9 22558395 2014-03-21        2     3   9412 vectorization      
## 10 22558395 2014-03-21        2     3  18621 operator-precedence
## # … with 497,143 more rows

29) Average answers by question


tagged_answers <- question_answer_counts %>%
    inner_join(question_tags, by = c("id" = "question_id")) %>%
    inner_join(tags, by = c("tag_id" = "id"))

tagged_answers %>%
  # Aggregate by tag_name
  group_by(tag_name)  %>%
  # Summarize questions and average_answers
  summarize(questions = n(),
            average_answers = mean(n)) %>%
  # Sort the questions in descending order
  arrange(desc(questions))
## # A tibble: 7,840 × 3
##    tag_name   questions average_answers
##    <chr>          <int>           <dbl>
##  1 ggplot2        28228           1.15 
##  2 dataframe      18874           1.67 
##  3 shiny          14219           0.921
##  4 dplyr          14039           1.55 
##  5 plot           11315           1.23 
##  6 data.table      8809           1.47 
##  7 matrix          6205           1.45 
##  8 loops           5149           1.39 
##  9 regex           4912           1.91 
## 10 function        4892           1.30 
## # … with 7,830 more rows

30) Joining questions and answers with tags


# Inner join the question_tags and tags tables with the questions table
questions_with_tags <- questions %>%
  inner_join(question_tags, by = c("id" = "question_id")) %>%
  inner_join(tags, by = c("tag_id" = "id"))

# Inner join the question_tags and tags tables with the answers table
answers_with_tags <- answers %>%
  inner_join(question_tags, by = "question_id") %>%
  inner_join(tags, by = c("tag_id" = "id"))

31) Binding and counting posts with tags


questions_with_tags <- questions %>%
  inner_join(question_tags, by = c("id" = "question_id")) %>%
  inner_join(tags, by = c("tag_id" = "id"))

answers_with_tags <- answers %>%
  inner_join(question_tags, by = "question_id") %>%
  inner_join(tags, by = c("tag_id" = "id"))

library(lubridate)
## Warning: package 'lubridate' was built under R version 4.2.1
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Combine the two tables into posts_with_tags
posts_with_tags <- bind_rows(questions_with_tags %>% mutate(type = "question"),
                              answers_with_tags %>% mutate(type = "answer"))

# Add a year column, then count by type, year, and tag_name
by_type_year_tag <- posts_with_tags %>%
  mutate(year = year(creation_date)) %>%
  count(type, year, tag_name)

32) Visualizing questions and answers in tags

library(ggplot2)
# Filter for the dplyr and ggplot2 tag names 
by_type_year_tag_filtered <- by_type_year_tag %>%
  filter(tag_name %in% c("dplyr", "ggplot2"))

# Create a line plot faceted by the tag name 
ggplot(by_type_year_tag_filtered, aes(year, n, color = type)) +
  geom_line() +
  facet_wrap(~ tag_name)
comments powered by Disqus