R and duckdb

Thiago Pires

duckdb

Why duckdb?


Simple

  • Embedded (see the minimal sketch below)
  • No external dependencies
  • APIs for R, Python, Julia, etc.
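
To show how little setup the embedded engine needs, here is a minimal sketch: the database runs inside the R process, in memory, with nothing else to install or manage.

# In-process, in-memory database: connect, run a query, disconnect
con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
DBI::dbGetQuery(con, "SELECT 42 AS answer")
duckdb::dbDisconnect(con, shutdown = TRUE)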


Fast

  • Optimized for analytics
  • Parallel query processing (see the sketch below)
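
Query parallelism can be tuned through a configuration option; a minimal sketch (the value 4 is only an example):

# Set the number of threads used by the query engine and read the setting back
con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
DBI::dbExecute(con, "SET threads TO 4;")
DBI::dbGetQuery(con, "SELECT current_setting('threads') AS threads")
duckdb::dbDisconnect(con, shutdown = TRUE)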

Feature-rich

  • Broad SQL support
  • Direct querying of parquet and CSV files


Free

  • Open source and free of charge
  • MIT license

Simple

Installation

# Stable
install.packages("duckdb") 

# Development
install.packages('duckdb', repos=c('https://duckdb.r-universe.dev', 'https://cloud.r-project.org'))


Version

packageVersion("duckdb")
[1] '0.7.1.1'

Fast

Comparison

Example: New York City Taxi Trip Duration (191 MB)

# Using base R and dplyr
system.time({
  read.csv("../data/nyc-taxi.csv") |>
    dplyr::mutate(month = lubridate::month(dropoff_datetime)) |>
    dplyr::group_by(month) |>
    dplyr::summarise(`Média (s)` = mean(trip_duration, na.rm = TRUE))
})
   user  system elapsed 
 14.323   6.096  21.711 
# Using duckdb and dplyr (dbplyr installed)
system.time({
  con <- duckdb::dbConnect(duckdb::duckdb(), "../data/nyc-taxi.duckdb")
  duckdb::duckdb_read_csv(con, "nyc-taxi", "../data/nyc-taxi.csv")
  dplyr::tbl(con, "nyc-taxi") |>
    dplyr::mutate(month = dplyr::sql("datepart('month', strptime(dropoff_datetime, '%Y-%m-%d %H:%M:%S'))")) |>
    dplyr::group_by(month) |>
    dplyr::summarise(`Média (s)` = mean(trip_duration, na.rm = TRUE))
  duckdb::dbDisconnect(con, shutdown = TRUE)
})
   user  system elapsed 
  2.024   0.145   2.331

Features

Data types

22 data types are supported

con <- 
  duckdb::dbConnect(duckdb::duckdb(), ":memory:")

dplyr::tibble(boolean = c(TRUE, TRUE, FALSE, TRUE),
              double = c(-1.2, 5.65, 0.91, 100),
              integer = c(3L, 20L, 0L, -2L),
              timestamp = c("2023-04-01 12:13", "2023-05-30 01:45", 
                            "2023-06-07 13:01", "2023-09-23 23:02") |> lubridate::ymd_hm(),
              varchar = LETTERS[5:8]) |>
  duckdb::dbWriteTable(con, "examples", value = _, overwrite = TRUE)

dplyr::tbl(con, "examples")
# Source:   table<examples> [4 x 5]
# Database: DuckDB 0.7.1 [root@Darwin 22.4.0:R 4.2.3/:memory:]
  boolean double integer timestamp           varchar
  <lgl>    <dbl>   <int> <dttm>              <chr>  
1 TRUE     -1.2        3 2023-04-01 12:13:00 E      
2 TRUE      5.65      20 2023-05-30 01:45:00 F      
3 FALSE     0.91       0 2023-06-07 13:01:00 G      
4 TRUE    100         -2 2023-09-23 23:02:00 H      
duckdb::dbDisconnect(con, shutdown = TRUE)
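
The full list of built-in types can be queried from the catalog itself; a minimal sketch using the duckdb_types() table function:

con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
# List the logical types known to the engine
DBI::dbGetQuery(con, "SELECT DISTINCT logical_type FROM duckdb_types() ORDER BY logical_type;")
duckdb::dbDisconnect(con, shutdown = TRUE)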

Nested types


These are the LIST, STRUCT, and MAP types; LIST and STRUCT appear in the example below, with a MAP sketch after it.

con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
DBI::dbExecute(con, "CREATE TABLE NEST (int_list INT[], varchar_list VARCHAR[], struct STRUCT(i INT, j VARCHAR))")
[1] 0
stmt <- DBI::dbSendStatement(con, "INSERT INTO NEST VALUES (?, ?, ?)")
DBI::dbBind(stmt, list("[1, 2]", "['a', 'b']", "{'i': 5, 'j': 'c'}"))
DBI::dbClearResult(stmt)


Querying the table:

dplyr::tbl(con, "nest")
# Source:   table<nest> [1 x 3]
# Database: DuckDB 0.7.1 [root@Darwin 22.4.0:R 4.2.3/:memory:]
  int_list  varchar_list struct$i $j   
  <list>    <list>          <int> <chr>
1 <int [2]> <chr [2]>           5 c    
duckdb::dbDisconnect(con, shutdown = TRUE)
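
The MAP type does not appear in the example above; a minimal sketch of building one directly in SQL with the map() function (cast to VARCHAR here just for display):

con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
# Pair a list of keys with a list of values and show the result as text
DBI::dbGetQuery(con, "SELECT map([1, 2], ['a', 'b'])::VARCHAR AS example_map")
duckdb::dbDisconnect(con, shutdown = TRUE)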

Reading and writing files in different formats

  • csv:
con <- duckdb::dbConnect(duckdb::duckdb(), "../data/nyc-taxi.duckdb")
duckdb::duckdb_read_csv(con, "nyc-taxi", "../data/nyc-taxi.csv")

# Save to parquet
DBI::dbExecute(con, "COPY 'nyc-taxi' TO '../data/nyc-taxi.parquet' (FORMAT PARQUET);")
duckdb::dbDisconnect(con, shutdown = TRUE)
  • parquet:
con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
DBI::dbGetQuery(con, "SELECT * FROM read_parquet('../data/nyc-taxi.parquet') LIMIT 2;") |> dplyr::as_tibble()
# A tibble: 2 × 11
  id        vendor_id pickup_datetime     dropoff_datetime    passenger_count
  <chr>         <int> <chr>               <chr>                         <int>
1 id2875421         2 2016-03-14 17:24:55 2016-03-14 17:32:30               1
2 id2377394         1 2016-06-12 00:43:35 2016-06-12 00:54:38               1
# ℹ 6 more variables: pickup_longitude <dbl>, pickup_latitude <dbl>,
#   dropoff_longitude <dbl>, dropoff_latitude <dbl>, store_and_fwd_flag <chr>,
#   trip_duration <int>
duckdb::dbDisconnect(con, shutdown = TRUE)

Reading and writing files in different formats

  • json:
[
  {"Name" : "Mario", "Age" : 32, "Occupation" : "Plumber"}, 
  {"Name" : "Peach", "Age" : 21, "Occupation" : "Princess"},
  {},
  {"Name" : "Bowser", "Occupation" : "Koopa"}
]


con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
DBI::dbExecute(con, "INSTALL json;")
[1] 0
DBI::dbExecute(con, "LOAD json;")
[1] 0
DBI::dbGetQuery(con, "SELECT * FROM read_json_auto('../data/example.json')")
    Name Age Occupation
1  Mario  32    Plumber
2  Peach  21   Princess
3   <NA>  NA       <NA>
4 Bowser  NA      Koopa
duckdb::dbDisconnect(con, shutdown = TRUE)

Functions

df <- data.frame(repo = c("th1460/duckdb-ser"),
                 start_date = c("1984-10-19") |> as.Date())
con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")

# Register the df as a virtual table (view)
duckdb::duckdb_register(con, "functions", df) 

dplyr::tbl(con, "functions") |>
  dplyr::mutate(extract = dplyr::sql("regexp_extract(repo, '[0-9]+')"),
                weeks = dplyr::sql("datediff('week', start_date, today())"),
                pi = dplyr::sql("pi()"))
# Source:   SQL [1 x 5]
# Database: DuckDB 0.7.1 [root@Darwin 22.4.0:R 4.2.3/:memory:]
  repo              start_date extract weeks    pi
  <chr>             <date>     <chr>   <dbl> <dbl>
1 th1460/duckdb-ser 1984-10-19 1460     2011  3.14
duckdb::dbDisconnect(con, shutdown = TRUE)

Use case: Text mining

bible <- readr::read_lines(url("https://www.o-bible.com/download/kjv.txt"), skip = 1) |>
  dplyr::as_tibble()

con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
duckdb::duckdb_register(con, "bible", bible)
(words <- dplyr::tbl(con, "bible") |>
  dplyr::mutate(book = dplyr::sql("regexp_extract(regexp_extract(value, '\\w+\\d+\\:\\d+'), '[A-Za-z]+')"),
                text = dplyr::sql("lcase(trim(regexp_replace(value, '\\w+\\d+\\:\\d+|\\;|\\,|\\.|\\:', '', 'g')))"),
                word = dplyr::sql("regexp_split_to_array(text, '\\s')"),
                word_clean = dplyr::sql("list_filter(word, x -> NOT regexp_matches(x, 'in|the|and'))")) |>
  dplyr::select(book, text, word, word_clean) |> head(1) |> dplyr::as_tibble())
# A tibble: 1 × 4
  book  text                                                  word   word_clean
  <chr> <chr>                                                 <list> <list>    
1 Ge    in the beginning god created the heaven and the earth <chr>  <chr [4]> 
words$word
[[1]]
 [1] "in"        "the"       "beginning" "god"       "created"   "the"      
 [7] "heaven"    "and"       "the"       "earth"    
words$word_clean
[[1]]
[1] "god"     "created" "heaven"  "earth"  
duckdb::dbDisconnect(con, shutdown = TRUE)

Use case: COVID-19 data

url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"

read.csv(url, stringsAsFactors = FALSE) |>
  dplyr::as_tibble()
# A tibble: 289 × 1,147
   Province.State         Country.Region   Lat   Long X1.22.20 X1.23.20 X1.24.20
   <chr>                  <chr>          <dbl>  <dbl>    <int>    <int>    <int>
 1 ""                     Afghanistan     33.9  67.7         0        0        0
 2 ""                     Albania         41.2  20.2         0        0        0
 3 ""                     Algeria         28.0   1.66        0        0        0
 4 ""                     Andorra         42.5   1.52        0        0        0
 5 ""                     Angola         -11.2  17.9         0        0        0
 6 ""                     Antarctica     -71.9  23.3         0        0        0
 7 ""                     Antigua and B…  17.1 -61.8         0        0        0
 8 ""                     Argentina      -38.4 -63.6         0        0        0
 9 ""                     Armenia         40.1  45.0         0        0        0
10 "Australian Capital T… Australia      -35.5 149.          0        0        0
# ℹ 279 more rows
# ℹ 1,140 more variables: X1.25.20 <int>, X1.26.20 <int>, X1.27.20 <int>,
#   X1.28.20 <int>, X1.29.20 <int>, X1.30.20 <int>, X1.31.20 <int>,
#   X2.1.20 <int>, X2.2.20 <int>, X2.3.20 <int>, X2.4.20 <int>, X2.5.20 <int>,
#   X2.6.20 <int>, X2.7.20 <int>, X2.8.20 <int>, X2.9.20 <int>, X2.10.20 <int>,
#   X2.11.20 <int>, X2.12.20 <int>, X2.13.20 <int>, X2.14.20 <int>,
#   X2.15.20 <int>, X2.16.20 <int>, X2.17.20 <int>, X2.18.20 <int>, …

Use case: COVID-19 data

# https://github.com/duckdb/duckdb/pull/6387 (0.8.0)
con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
duckdb::duckdb_register(con, "covid19", read.csv(url, stringsAsFactors = FALSE))

dplyr::tbl(con, dplyr::sql("(PIVOT_LONGER covid19 ON COLUMNS('X') INTO NAME date VALUE cumulate)")) |>
  dplyr::select(country = Country.Region, date, cumulate) |>
  dplyr::mutate(date = dplyr::sql("strptime(replace(date, 'X', ''), '%m.%d.%y')"),
                value = cumulate - dplyr::lag(cumulate)) |>
  dplyr::filter(date > "2020-02-23") |> head(3)
  
duckdb::dbDisconnect(con, shutdown = TRUE)
# Source:   SQL [3 x 4]
# Database: DuckDB 0.7.2-dev2706 [root@Darwin 22.4.0:R 4.2.3/:memory:]
  country     date                cumulate value
  <chr>       <dttm>                 <int> <int>
1 Afghanistan 2020-02-24 00:00:00        5     5
2 Afghanistan 2020-02-25 00:00:00        5     0
3 Afghanistan 2020-02-26 00:00:00        5     0

Reading data from S3 (IBM COS)

In this example, IBM Cloud Object Storage (COS) is used to store a dataset in parquet format.

readRenviron(".Renviron")

con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
DBI::dbExecute(con, "INSTALL httpfs;") 
[1] 0
DBI::dbExecute(con, "LOAD httpfs;") 
[1] 0
DBI::dbExecute(con, glue::glue("SET s3_region='{Sys.getenv('S3_REGION')}';"))
[1] 0
DBI::dbExecute(con, glue::glue("SET s3_endpoint='{Sys.getenv('S3_ENDPOINT')}';"))
[1] 0
DBI::dbExecute(con, glue::glue("SET s3_access_key_id='{Sys.getenv('S3_ACCESS_KEY_ID')}';"))
[1] 0
DBI::dbExecute(con, glue::glue("SET s3_secret_access_key='{Sys.getenv('S3_SECRET_ACCESS_KEY')}';"))
[1] 0

Reading data from S3 (IBM COS)

dplyr::tbl(con, "s3://duckdb-ser/nyc-taxi.parquet")
# Source:   table<s3://duckdb-ser/nyc-taxi.parquet> [?? x 11]
# Database: DuckDB 0.7.1 [root@Darwin 22.4.0:R 4.2.3/:memory:]
   id        vendor_id pickup_datetime     dropoff_datetime    passenger_count
   <chr>         <int> <chr>               <chr>                         <int>
 1 id2875421         2 2016-03-14 17:24:55 2016-03-14 17:32:30               1
 2 id2377394         1 2016-06-12 00:43:35 2016-06-12 00:54:38               1
 3 id3858529         2 2016-01-19 11:35:24 2016-01-19 12:10:48               1
 4 id3504673         2 2016-04-06 19:32:31 2016-04-06 19:39:40               1
 5 id2181028         2 2016-03-26 13:30:55 2016-03-26 13:38:10               1
 6 id0801584         2 2016-01-30 22:01:40 2016-01-30 22:09:03               6
 7 id1813257         1 2016-06-17 22:34:59 2016-06-17 22:40:40               4
 8 id1324603         2 2016-05-21 07:54:58 2016-05-21 08:20:49               1
 9 id1301050         1 2016-05-27 23:12:23 2016-05-27 23:16:38               1
10 id0012891         2 2016-03-10 21:45:01 2016-03-10 22:05:26               1
# ℹ more rows
# ℹ 6 more variables: pickup_longitude <dbl>, pickup_latitude <dbl>,
#   dropoff_longitude <dbl>, dropoff_latitude <dbl>, store_and_fwd_flag <chr>,
#   trip_duration <int>
duckdb::dbDisconnect(con, shutdown = TRUE)

Spatial data

# New connection; the httpfs/S3 settings from the previous slides must be applied again before querying
con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
DBI::dbExecute(con, "INSTALL spatial;")
DBI::dbExecute(con, "LOAD spatial;")

nyc_taxi_spatial <- dplyr::tbl(con, "s3://duckdb-ser/nyc-taxi.parquet") |>
  dplyr::mutate(pickup_point = dplyr::sql("ST_Transform(ST_Point(pickup_latitude, pickup_longitude), 'EPSG:4326', 'ESRI:102718')"),
                dropoff_point = dplyr::sql("ST_Transform(ST_Point(dropoff_latitude, dropoff_longitude), 'EPSG:4326', 'ESRI:102718')"),
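                # ESRI:102718 is in US feet; dividing by 3280.84 converts the distance to kilometres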
                aerial_distance = dplyr::sql("ST_Distance(pickup_point, dropoff_point)/3280.84")) |> dplyr::as_tibble()

nyc_taxi_spatial |> 
  dplyr::select(pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, aerial_distance) |> 
  dplyr::slice(1) |>
  tidyr::pivot_longer(tidyr::everything()) |> 
  dplyr::mutate(value = tibble::num(value, digits = 5))
# A tibble: 5 × 2
  name                  value
  <chr>             <num:.5!>
1 pickup_longitude  -73.98215
2 pickup_latitude    40.76794
3 dropoff_longitude -73.96463
4 dropoff_latitude   40.76560
5 aerial_distance     1.50216

An embedded database

Example of an API with plumber

#* Show information for a given ID
#* @param id
#* @get /info
function(id) {
    # Read environment variables
    readRenviron("../.Renviron")

    # Create the database connection
    con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
    invisible(DBI::dbExecute(con, "INSTALL httpfs;"))
    invisible(DBI::dbExecute(con, "LOAD httpfs;"))
    invisible(DBI::dbExecute(con, glue::glue("SET s3_region='{Sys.getenv('S3_REGION')}';")))
    invisible(DBI::dbExecute(con, glue::glue("SET s3_endpoint='{Sys.getenv('S3_ENDPOINT')}';")))
    invisible(DBI::dbExecute(con, glue::glue("SET s3_access_key_id='{Sys.getenv('S3_ACCESS_KEY_ID')}';")))
    invisible(DBI::dbExecute(con, glue::glue("SET s3_secret_access_key='{Sys.getenv('S3_SECRET_ACCESS_KEY')}';")))

    # Query: !! inlines the function's id argument into the translated SQL
    resposta <- dplyr::tbl(con, "s3://duckdb-ser/nyc-taxi.parquet") |>
        dplyr::filter(id == !!id) |> dplyr::as_tibble() |> as.data.frame()

    duckdb::dbDisconnect(con, shutdown = TRUE)

    # Result
    return(jsonlite::toJSON(resposta))
}
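
To serve this endpoint locally, the function above can be loaded with plumber; a minimal sketch (api.R is a hypothetical file name, and the port matches the request on the next slide):

# Start the API on port 5830
plumber::pr("api.R") |> plumber::pr_run(port = 5830)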

An embedded database

API response

httr::GET("http://127.0.0.1:5830/info?id=id2875421") |> 
  httr::content() |> 
  jsonlite::toJSON(auto_unbox = TRUE)
["[{\"id\":\"id2875421\",\"vendor_id\":2,\"pickup_datetime\":\"2016-03-14 17:24:55\",\"dropoff_datetime\":\"2016-03-14 17:32:30\",\"passenger_count\":1,\"pickup_longitude\":-73.9822,\"pickup_latitude\":40.7679,\"dropoff_longitude\":-73.9646,\"dropoff_latitude\":40.7656,\"store_and_fwd_flag\":\"N\",\"trip_duration\":455}]"] 

Thank you

Contact

th1460.github.io
github.com/th1460
medium.com/@thopr
linkedin.com/in/thop