First some settings…

knitr::opts_chunk$set(warning = FALSE, message = FALSE) 

R - tidyverse

library(tidyverse)

charts <- read_csv("data/charts.csv", n_max = 100000)
head(charts)
ABCDEFGHIJ0123456789
title
<chr>
rank
<dbl>
date
<date>
artist
<chr>
Despacito (Featuring Daddy Yankee)12017-02-01Luis Fonsi
Reggaetón Lento (Bailemos)22017-02-01CNCO
Chantaje (feat. Maluma)32017-02-01Shakira
Cuando Se Pone a Bailar42017-02-01Rombai
Otra vez (feat. J Balvin)52017-02-01Zion & Lennox
El Amante62017-02-01Nicky Jam
charts %>% 
  filter(chart == 'top200', region == 'Sweden') %>% 
  group_by(artist) %>% 
  summarise(
    n_dates = n_distinct(date), 
    n_songs = n_distinct(title)) %>% 
  arrange(desc(n_dates)) %>% 
  head(10)
ABCDEFGHIJ0123456789
artist
<chr>
n_dates
<int>
n_songs
<int>
A R I Z O N A82
Adele85
AJR81
Alan Walker83
Alessia Cara82
Alok, Bruno Martini, Zeeba81
Ariana Grande, Nicki Minaj81
Bebe Rexha81
Big Sean81
Bruno Mars82

R - duckdb

library(DBI)
library(duckdb)
con = dbConnect(duckdb(), dbdir="duckdb_r_db")
dbExecute(con,"drop table if exists charts;")
## [1] 0
dbExecute(con, "
  create table charts as 
  select * from read_csv_auto('data/charts.csv');
")
## [1] 25001801
dbGetQuery(con, "select * from charts limit 10")
ABCDEFGHIJ0123456789
title
<chr>
rank
<int>
date
<date>
artist
<chr>
Despacito (Featuring Daddy Yankee)12017-02-01Luis Fonsi
Reggaetón Lento (Bailemos)22017-02-01CNCO
Chantaje (feat. Maluma)32017-02-01Shakira
Cuando Se Pone a Bailar42017-02-01Rombai
Otra vez (feat. J Balvin)52017-02-01Zion & Lennox
El Amante62017-02-01Nicky Jam
Shape of You72017-02-01Ed Sheeran
Vente Pa' Ca (feat. Maluma)82017-02-01Ricky Martin
Safari92017-02-01J Balvin, Pharrell Williams, BIA, Sky
Vacaciones102017-02-01Wisin
dbGetQuery(con, "
  select 
    artist, 
    count(distinct date) n_dates, 
    count(distinct title) n_songs
  from charts
  where chart = 'top200' and region = 'Sweden'
  group by artist
  order by n_dates desc
  limit 10
")
ABCDEFGHIJ0123456789
artist
<chr>
n_dates
<dbl>
n_songs
<dbl>
Hov1166446
Avicii154052
Ed Sheeran151459
Laleh148418
Molly Sandén146837
Journey14311
Miriam Bryant129118
James Arthur127116
Post Malone124939
Billie Eilish120540
duckdb_charts <- tbl(con, "charts")
head(duckdb_charts)
ABCDEFGHIJ0123456789
title
<chr>
rank
<int>
date
<date>
artist
<chr>
Despacito (Featuring Daddy Yankee)12017-02-01Luis Fonsi
Reggaetón Lento (Bailemos)22017-02-01CNCO
Chantaje (feat. Maluma)32017-02-01Shakira
Cuando Se Pone a Bailar42017-02-01Rombai
Otra vez (feat. J Balvin)52017-02-01Zion & Lennox
El Amante62017-02-01Nicky Jam
duckdb_charts <- tbl(con, "charts")

duckdb_charts %>% 
  filter(chart == 'top200', region == 'Sweden') %>% 
  group_by(artist) %>% 
  summarise(n_dates = n_distinct(date), n_songs = n_distinct(title)) %>% 
  arrange(desc(n_dates)) %>% 
  head(10)
ABCDEFGHIJ0123456789
artist
<chr>
n_dates
<dbl>
n_songs
<dbl>
Hov1166446
Avicii154052
Ed Sheeran151459
Laleh148418
Molly Sandén146837
Journey14311
Miriam Bryant129118
James Arthur127116
Post Malone124939
Billie Eilish120540
dbDisconnect(con, shutdown=TRUE)

R - spark

library(sparklyr)
library(tidyverse)
sc <- spark_connect(master = "local")
spark_charts <- spark_read_csv(sc, "charts", "data/charts.csv")
head(spark_charts)
ABCDEFGHIJ0123456789
title
<chr>
rank
<chr>
date
<chr>
artist
<chr>
Despacito (Featuring Daddy Yankee)12017-02-01Luis Fonsi
Reggaetón Lento (Bailemos)22017-02-01CNCO
Chantaje (feat. Maluma)32017-02-01Shakira
Cuando Se Pone a Bailar42017-02-01Rombai
Otra vez (feat. J Balvin)52017-02-01Zion & Lennox
El Amante62017-02-01Nicky Jam
spark_charts %>% 
  filter(chart == 'top200', region == 'Sweden') %>% 
  group_by(artist) %>% 
  summarise(n_dates = n_distinct(date), n_songs = n_distinct(title)) %>% 
  arrange(desc(n_dates)) %>% 
  head(10)
ABCDEFGHIJ0123456789
artist
<chr>
n_dates
<dbl>
n_songs
<dbl>
Hov1166446
Avicii154052
Ed Sheeran151459
Laleh148418
Molly Sandén146837
Journey14311
Miriam Bryant129118
James Arthur127116
Post Malone124939
Billie Eilish120540
dbGetQuery(sc, "select * from charts limit 10")
ABCDEFGHIJ0123456789
title
<chr>
rank
<chr>
date
<chr>
artist
<chr>
Despacito (Featuring Daddy Yankee)12017-02-01Luis Fonsi
Reggaetón Lento (Bailemos)22017-02-01CNCO
Chantaje (feat. Maluma)32017-02-01Shakira
Cuando Se Pone a Bailar42017-02-01Rombai
Otra vez (feat. J Balvin)52017-02-01Zion & Lennox
El Amante62017-02-01Nicky Jam
Shape of You72017-02-01Ed Sheeran
Vente Pa' Ca (feat. Maluma)82017-02-01Ricky Martin
Safari92017-02-01J Balvin, Pharrell Williams, BIA, Sky
Vacaciones102017-02-01Wisin
dbGetQuery(sc, "
  select 
    artist, 
    count(distinct date) n_dates, 
    count(distinct title) n_songs
  from charts
  where chart = 'top200' and region = 'Sweden'
  group by artist
  order by n_dates desc
  limit 10
")
ABCDEFGHIJ0123456789
artist
<chr>
n_dates
<dbl>
n_songs
<dbl>
Hov1166446
Avicii154052
Ed Sheeran151459
Laleh148418
Molly Sandén146837
Journey14311
Miriam Bryant129118
James Arthur127116
Post Malone124939
Billie Eilish120540
spark_disconnect(sc)

R - Snowflake

library("RJDBC")
library("DBI")

jdbcDriver <- JDBC(driverClass="com.snowflake.client.jdbc.SnowflakeDriver", 
                   classPath=Sys.getenv("SNOWFLAKE_CLASS_PATH"))
con <- dbConnect(jdbcDriver, "jdbc:snowflake://kc75442.us-east-1.snowflakecomputing.com", 
                 Sys.getenv("SNOWFLAKE_USER"),Sys.getenv("SNOWFLAKE_PASSWORD"))

dbSendQuery(con, "use role data_scientist")
## <JDBCResult>
dbSendQuery(con, "use schema analytics.ds")
## <JDBCResult>
dbGetQuery(con, "select * from charts limit 10") 
ABCDEFGHIJ0123456789
 
 
TITLE
<chr>
RANK
<dbl>
DATE
<chr>
ARTIST
<chr>
1#Ysya2020 Vol. 6 - Un Flow de Infarto292020-11-17YSY A, Bizarrap
2Reloj302020-11-17Rauw Alejandro, Anuel AA
3Make A Wish (Birthday Song)312020-11-17NCT U
4Bonita322020-11-17Jeeiph
5Una Locura - Remix332020-11-17Bebe DJ, Maxi Jayat
6La Tóxica342020-11-17Farruko
7Why Not?352020-11-17LOONA
8Sofia362020-11-17Clairo
9Jeans372020-11-17Justin Quiles
10Se Te Nota (with Guaynaa)382020-11-17Lele Pons
dbGetQuery(con, "
  select 
    artist, 
    count(distinct date) n_dates, 
    count(distinct title) n_songs
  from charts
  where chart = 'top200' and region = 'Sweden'
  group by artist
  order by n_dates desc
  limit 10
")
ABCDEFGHIJ0123456789
 
 
ARTIST
<chr>
N_DATES
<dbl>
N_SONGS
<dbl>
1Hov1166446
2Avicii154052
3Ed Sheeran151459
4Laleh148418
5Molly Sandén146837
6Journey14311
7Miriam Bryant129118
8James Arthur127116
9Post Malone124939
10Billie Eilish120540
dbGetQuery(con, "
  select *
  from charts
  where artist = 'Journey'
  order by date desc
  limit 100
")
ABCDEFGHIJ0123456789
 
 
TITLE
<chr>
RANK
<dbl>
DATE
<chr>
ARTIST
<chr>
1Don't Stop Believin'1032021-09-30Journey
2Don't Stop Believin'1962021-09-30Journey
3Don't Stop Believin'1012021-09-30Journey
4Don't Stop Believin'1372021-09-30Journey
5Don't Stop Believin'1932021-09-30Journey
6Don't Stop Believin'1732021-09-30Journey
7Don't Stop Believin'1392021-09-30Journey
8Don't Stop Believin'1252021-09-29Journey
9Don't Stop Believin'1492021-09-29Journey
10Don't Stop Believin'1842021-09-29Journey
dbDisconnect(con)
## [1] TRUE