Big Data Strategies in R

First some settings…

knitr::opts_chunk$set(warning = FALSE, message = FALSE)

R - tidyverse

library(tidyverse)

charts <- read_csv("data/charts.csv", n_max = 100000)
head(charts)

ABCDEFGHIJ0123456789

title <chr>	rank <dbl>	date <date>	artist <chr>
Despacito (Featuring Daddy Yankee)	1	2017-02-01	Luis Fonsi
Reggaetón Lento (Bailemos)	2	2017-02-01	CNCO
Chantaje (feat. Maluma)	3	2017-02-01	Shakira
Cuando Se Pone a Bailar	4	2017-02-01	Rombai
Otra vez (feat. J Balvin)	5	2017-02-01	Zion & Lennox
El Amante	6	2017-02-01	Nicky Jam

charts %>% 
  filter(chart == 'top200', region == 'Sweden') %>% 
  group_by(artist) %>% 
  summarise(
    n_dates = n_distinct(date), 
    n_songs = n_distinct(title)) %>% 
  arrange(desc(n_dates)) %>% 
  head(10)

ABCDEFGHIJ0123456789

artist <chr>	n_dates <int>	n_songs <int>
A R I Z O N A	8	2
Adele	8	5
AJR	8	1
Alan Walker	8	3
Alessia Cara	8	2
Alok, Bruno Martini, Zeeba	8	1
Ariana Grande, Nicki Minaj	8	1
Bebe Rexha	8	1
Big Sean	8	1
Bruno Mars	8	2

R - duckdb

library(DBI)
library(duckdb)
con = dbConnect(duckdb(), dbdir="duckdb_r_db")

dbExecute(con,"drop table if exists charts;")

## [1] 0

dbExecute(con, "
  create table charts as 
  select * from read_csv_auto('data/charts.csv');
")

## [1] 25001801

dbGetQuery(con, "select * from charts limit 10")

ABCDEFGHIJ0123456789

title <chr>	rank <int>	date <date>	artist <chr>
Despacito (Featuring Daddy Yankee)	1	2017-02-01	Luis Fonsi
Reggaetón Lento (Bailemos)	2	2017-02-01	CNCO
Chantaje (feat. Maluma)	3	2017-02-01	Shakira
Cuando Se Pone a Bailar	4	2017-02-01	Rombai
Otra vez (feat. J Balvin)	5	2017-02-01	Zion & Lennox
El Amante	6	2017-02-01	Nicky Jam
Shape of You	7	2017-02-01	Ed Sheeran
Vente Pa' Ca (feat. Maluma)	8	2017-02-01	Ricky Martin
Safari	9	2017-02-01	J Balvin, Pharrell Williams, BIA, Sky
Vacaciones	10	2017-02-01	Wisin

dbGetQuery(con, "
  select 
    artist, 
    count(distinct date) n_dates, 
    count(distinct title) n_songs
  from charts
  where chart = 'top200' and region = 'Sweden'
  group by artist
  order by n_dates desc
  limit 10
")

ABCDEFGHIJ0123456789

artist <chr>	n_dates <dbl>	n_songs <dbl>
Hov1	1664	46
Avicii	1540	52
Ed Sheeran	1514	59
Laleh	1484	18
Molly Sandén	1468	37
Journey	1431	1
Miriam Bryant	1291	18
James Arthur	1271	16
Post Malone	1249	39
Billie Eilish	1205	40

duckdb_charts <- tbl(con, "charts")
head(duckdb_charts)

ABCDEFGHIJ0123456789

title <chr>	rank <int>	date <date>	artist <chr>
Despacito (Featuring Daddy Yankee)	1	2017-02-01	Luis Fonsi
Reggaetón Lento (Bailemos)	2	2017-02-01	CNCO
Chantaje (feat. Maluma)	3	2017-02-01	Shakira
Cuando Se Pone a Bailar	4	2017-02-01	Rombai
Otra vez (feat. J Balvin)	5	2017-02-01	Zion & Lennox
El Amante	6	2017-02-01	Nicky Jam

duckdb_charts <- tbl(con, "charts")

duckdb_charts %>% 
  filter(chart == 'top200', region == 'Sweden') %>% 
  group_by(artist) %>% 
  summarise(n_dates = n_distinct(date), n_songs = n_distinct(title)) %>% 
  arrange(desc(n_dates)) %>% 
  head(10)

ABCDEFGHIJ0123456789

artist <chr>	n_dates <dbl>	n_songs <dbl>
Hov1	1664	46
Avicii	1540	52
Ed Sheeran	1514	59
Laleh	1484	18
Molly Sandén	1468	37
Journey	1431	1
Miriam Bryant	1291	18
James Arthur	1271	16
Post Malone	1249	39
Billie Eilish	1205	40

dbDisconnect(con, shutdown=TRUE)

R - spark

library(sparklyr)
library(tidyverse)
sc <- spark_connect(master = "local")

spark_charts <- spark_read_csv(sc, "charts", "data/charts.csv")
head(spark_charts)

ABCDEFGHIJ0123456789

title <chr>	rank <chr>	date <chr>	artist <chr>
Despacito (Featuring Daddy Yankee)	1	2017-02-01	Luis Fonsi
Reggaetón Lento (Bailemos)	2	2017-02-01	CNCO
Chantaje (feat. Maluma)	3	2017-02-01	Shakira
Cuando Se Pone a Bailar	4	2017-02-01	Rombai
Otra vez (feat. J Balvin)	5	2017-02-01	Zion & Lennox
El Amante	6	2017-02-01	Nicky Jam

spark_charts %>% 
  filter(chart == 'top200', region == 'Sweden') %>% 
  group_by(artist) %>% 
  summarise(n_dates = n_distinct(date), n_songs = n_distinct(title)) %>% 
  arrange(desc(n_dates)) %>% 
  head(10)

ABCDEFGHIJ0123456789

artist <chr>	n_dates <dbl>	n_songs <dbl>
Hov1	1664	46
Avicii	1540	52
Ed Sheeran	1514	59
Laleh	1484	18
Molly Sandén	1468	37
Journey	1431	1
Miriam Bryant	1291	18
James Arthur	1271	16
Post Malone	1249	39
Billie Eilish	1205	40

dbGetQuery(sc, "select * from charts limit 10")

ABCDEFGHIJ0123456789

title <chr>	rank <chr>	date <chr>	artist <chr>
Despacito (Featuring Daddy Yankee)	1	2017-02-01	Luis Fonsi
Reggaetón Lento (Bailemos)	2	2017-02-01	CNCO
Chantaje (feat. Maluma)	3	2017-02-01	Shakira
Cuando Se Pone a Bailar	4	2017-02-01	Rombai
Otra vez (feat. J Balvin)	5	2017-02-01	Zion & Lennox
El Amante	6	2017-02-01	Nicky Jam
Shape of You	7	2017-02-01	Ed Sheeran
Vente Pa' Ca (feat. Maluma)	8	2017-02-01	Ricky Martin
Safari	9	2017-02-01	J Balvin, Pharrell Williams, BIA, Sky
Vacaciones	10	2017-02-01	Wisin

dbGetQuery(sc, "
  select 
    artist, 
    count(distinct date) n_dates, 
    count(distinct title) n_songs
  from charts
  where chart = 'top200' and region = 'Sweden'
  group by artist
  order by n_dates desc
  limit 10
")

ABCDEFGHIJ0123456789

artist <chr>	n_dates <dbl>	n_songs <dbl>
Hov1	1664	46
Avicii	1540	52
Ed Sheeran	1514	59
Laleh	1484	18
Molly Sandén	1468	37
Journey	1431	1
Miriam Bryant	1291	18
James Arthur	1271	16
Post Malone	1249	39
Billie Eilish	1205	40

spark_disconnect(sc)

R - Snowflake

library("RJDBC")
library("DBI")

jdbcDriver <- JDBC(driverClass="com.snowflake.client.jdbc.SnowflakeDriver", 
                   classPath=Sys.getenv("SNOWFLAKE_CLASS_PATH"))
con <- dbConnect(jdbcDriver, "jdbc:snowflake://kc75442.us-east-1.snowflakecomputing.com", 
                 Sys.getenv("SNOWFLAKE_USER"),Sys.getenv("SNOWFLAKE_PASSWORD"))

dbSendQuery(con, "use role data_scientist")

## <JDBCResult>

dbSendQuery(con, "use schema analytics.ds")

## <JDBCResult>

dbGetQuery(con, "select * from charts limit 10")

ABCDEFGHIJ0123456789

	TITLE <chr>	RANK <dbl>	DATE <chr>	ARTIST <chr>
1	#Ysya2020 Vol. 6 - Un Flow de Infarto	29	2020-11-17	YSY A, Bizarrap
2	Reloj	30	2020-11-17	Rauw Alejandro, Anuel AA
3	Make A Wish (Birthday Song)	31	2020-11-17	NCT U
4	Bonita	32	2020-11-17	Jeeiph
5	Una Locura - Remix	33	2020-11-17	Bebe DJ, Maxi Jayat
6	La Tóxica	34	2020-11-17	Farruko
7	Why Not?	35	2020-11-17	LOONA
8	Sofia	36	2020-11-17	Clairo
9	Jeans	37	2020-11-17	Justin Quiles
10	Se Te Nota (with Guaynaa)	38	2020-11-17	Lele Pons

dbGetQuery(con, "
  select 
    artist, 
    count(distinct date) n_dates, 
    count(distinct title) n_songs
  from charts
  where chart = 'top200' and region = 'Sweden'
  group by artist
  order by n_dates desc
  limit 10
")

ABCDEFGHIJ0123456789

	ARTIST <chr>	N_DATES <dbl>	N_SONGS <dbl>
1	Hov1	1664	46
2	Avicii	1540	52
3	Ed Sheeran	1514	59
4	Laleh	1484	18
5	Molly Sandén	1468	37
6	Journey	1431	1
7	Miriam Bryant	1291	18
8	James Arthur	1271	16
9	Post Malone	1249	39
10	Billie Eilish	1205	40

dbGetQuery(con, "
  select *
  from charts
  where artist = 'Journey'
  order by date desc
  limit 100
")

ABCDEFGHIJ0123456789

	TITLE <chr>	RANK <dbl>	DATE <chr>	ARTIST <chr>
1	Don't Stop Believin'	103	2021-09-30	Journey
2	Don't Stop Believin'	196	2021-09-30	Journey
3	Don't Stop Believin'	101	2021-09-30	Journey
4	Don't Stop Believin'	137	2021-09-30	Journey
5	Don't Stop Believin'	193	2021-09-30	Journey
6	Don't Stop Believin'	173	2021-09-30	Journey
7	Don't Stop Believin'	139	2021-09-30	Journey
8	Don't Stop Believin'	125	2021-09-29	Journey
9	Don't Stop Believin'	149	2021-09-29	Journey
10	Don't Stop Believin'	184	2021-09-29	Journey

dbDisconnect(con)

## [1] TRUE