Exemplo: New York City Taxi Trip Duration (191 MB)
usuário sistema decorrido
14.323 6.096 21.711
# Using duckdb and dplyr (dbplyr installed)
system.time({
  con <- duckdb::dbConnect(duckdb::duckdb(), "../data/nyc-taxi.duckdb")
  # Import the CSV into a DuckDB table before querying it
  duckdb::duckdb_read_csv(con, "nyc-taxi", "../data/nyc-taxi.csv")
  # Average trip duration per dropoff month.  collect() forces the lazy
  # dbplyr query to actually execute inside the timed block — without it
  # only the CSV import would be measured.
  dplyr::tbl(con, "nyc-taxi") |>
    dplyr::mutate(month = dplyr::sql("datepart('month', strptime(dropoff_datetime, '%Y-%m-%d %H:%M:%S'))")) |>
    dplyr::group_by(month) |>
    dplyr::summarise(`Média (s)` = mean(trip_duration, na.rm = TRUE)) |>
    dplyr::collect()
  duckdb::dbDisconnect(con, shutdown = TRUE)
})
usuário sistema decorrido
2.024 0.145 2.331
São 22 tipos de dados suportados
# Open an in-memory DuckDB connection
con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")

# One column per illustrated DuckDB type
examples <- dplyr::tibble(
  boolean = c(TRUE, TRUE, FALSE, TRUE),
  double = c(-1.2, 5.65, 0.91, 100),
  integer = c(3L, 20L, 0L, -2L),
  timestamp = lubridate::ymd_hm(c("2023-04-01 12:13", "2023-05-30 01:45",
                                  "2023-06-07 13:01", "2023-09-23 23:02")),
  varchar = LETTERS[5:8]
)

# Persist the tibble as the "examples" table, then inspect it
duckdb::dbWriteTable(con, "examples", value = examples, overwrite = TRUE)
dplyr::tbl(con, "examples")
# Source: table<examples> [4 x 5]
# Database: DuckDB 0.7.1 [root@Darwin 22.4.0:R 4.2.3/:memory:]
boolean double integer timestamp varchar
<lgl> <dbl> <int> <dttm> <chr>
1 TRUE -1.2 3 2023-04-01 12:13:00 E
2 TRUE 5.65 20 2023-05-30 01:45:00 F
3 FALSE 0.91 0 2023-06-07 13:01:00 G
4 TRUE 100 -2 2023-09-23 23:02:00 H
São suportados os tipos aninhados LIST, STRUCT e MAP.
# In-memory connection used to demonstrate DuckDB's nested types
con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
# Create a table with LIST columns (INT[], VARCHAR[]) and a STRUCT column;
# dbExecute returns the number of affected rows (0 for DDL)
DBI::dbExecute(con, "CREATE TABLE NEST (int_list INT[], varchar_list VARCHAR[], struct STRUCT(i INT, j VARCHAR))")
[1] 0
Fazendo consulta:
csv:
parquet:
# Query a Parquet file directly (no import step required)
con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
DBI::dbGetQuery(con, "SELECT * FROM read_parquet('../data/nyc-taxi.parquet') LIMIT 2;") |> dplyr::as_tibble()
# A tibble: 2 × 11
id vendor_id pickup_datetime dropoff_datetime passenger_count
<chr> <int> <chr> <chr> <int>
1 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1
2 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1
# ℹ 6 more variables: pickup_longitude <dbl>, pickup_latitude <dbl>,
# dropoff_longitude <dbl>, dropoff_latitude <dbl>, store_and_fwd_flag <chr>,
# trip_duration <int>
json:
[
{"Name" : "Mario", "Age" : 32, "Occupation" : "Plumber"},
{"Name" : "Peach", "Age" : 21, "Occupation" : "Princess"},
{},
{"Name" : "Bowser", "Occupation" : "Koopa"}
]
[1] 0
[1] 0
Name Age Occupation
1 Mario 32 Plumber
2 Peach 21 Princess
3 <NA> NA <NA>
4 Bowser NA Koopa
# Tiny one-row table used to demonstrate DuckDB scalar functions
df <- data.frame(
  repo = "th1460/duckdb-ser",
  start_date = as.Date("1984-10-19")
)

con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")

# Register the data frame as a virtual table (view); no data is copied
duckdb::duckdb_register(con, "functions", df)

# regexp_extract, datediff and pi are evaluated by DuckDB, not by R
dplyr::tbl(con, "functions") |>
  dplyr::mutate(
    extract = dplyr::sql("regexp_extract(repo, '[0-9]+')"),
    weeks = dplyr::sql("datediff('week', start_date, today())"),
    pi = dplyr::sql("pi()")
  )
# Source: SQL [1 x 5]
# Database: DuckDB 0.7.1 [root@Darwin 22.4.0:R 4.2.3/:memory:]
repo start_date extract weeks pi
<chr> <date> <chr> <dbl> <dbl>
1 th1460/duckdb-ser 1984-10-19 1460 2011 3.14
# Download the King James Bible, one verse per line (skip the header line)
bible <- readr::read_lines(url("https://www.o-bible.com/download/kjv.txt"), skip = 1) |>
dplyr::as_tibble()
# In-memory DuckDB; register the tibble as a view (no copy)
con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
duckdb::duckdb_register(con, "bible", bible)
# All text processing below runs inside DuckDB.  NOTE: the mutate() columns
# build on each other (text -> word -> word_clean); dbplyr translates them
# into nested SQL expressions, so their order matters.
(words <- dplyr::tbl(con, "bible") |>
# book: pull the "Ge1:1"-style verse reference, then keep only the letters
dplyr::mutate(book = dplyr::sql("regexp_extract(regexp_extract(value, '\\w+\\d+\\:\\d+'), '[A-Za-z]+')"),
# text: drop the verse reference and punctuation, lowercase and trim
text = dplyr::sql("lcase(trim(regexp_replace(value, '\\w+\\d+\\:\\d+|\\;|\\,|\\.|\\:', '', 'g')))"),
# word: split the cleaned text on whitespace into a LIST column
word = dplyr::sql("regexp_split_to_array(text, '\\s')"),
# word_clean: lambda filter removing words matching 'in|the|and'
word_clean = dplyr::sql("list_filter(word, x -> NOT regexp_matches(x, 'in|the|and'))")) |>
dplyr::select(book, text, word, word_clean) |> head(1) |> dplyr::as_tibble())
# A tibble: 1 × 4
book text word word_clean
<chr> <chr> <list> <list>
1 Ge in the beginning god created the heaven and the earth <chr> <chr [4]>
[[1]]
[1] "in" "the" "beginning" "god" "created" "the"
[7] "heaven" "and" "the" "earth"
[[1]]
[1] "god" "created" "heaven" "earth"
# CSV of cumulative confirmed COVID-19 cases (JHU CSSE time series)
url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
dplyr::as_tibble(read.csv(url, stringsAsFactors = FALSE))
# A tibble: 289 × 1,147
Province.State Country.Region Lat Long X1.22.20 X1.23.20 X1.24.20
<chr> <chr> <dbl> <dbl> <int> <int> <int>
1 "" Afghanistan 33.9 67.7 0 0 0
2 "" Albania 41.2 20.2 0 0 0
3 "" Algeria 28.0 1.66 0 0 0
4 "" Andorra 42.5 1.52 0 0 0
5 "" Angola -11.2 17.9 0 0 0
6 "" Antarctica -71.9 23.3 0 0 0
7 "" Antigua and B… 17.1 -61.8 0 0 0
8 "" Argentina -38.4 -63.6 0 0 0
9 "" Armenia 40.1 45.0 0 0 0
10 "Australian Capital T… Australia -35.5 149. 0 0 0
# ℹ 279 more rows
# ℹ 1,140 more variables: X1.25.20 <int>, X1.26.20 <int>, X1.27.20 <int>,
# X1.28.20 <int>, X1.29.20 <int>, X1.30.20 <int>, X1.31.20 <int>,
# X2.1.20 <int>, X2.2.20 <int>, X2.3.20 <int>, X2.4.20 <int>, X2.5.20 <int>,
# X2.6.20 <int>, X2.7.20 <int>, X2.8.20 <int>, X2.9.20 <int>, X2.10.20 <int>,
# X2.11.20 <int>, X2.12.20 <int>, X2.13.20 <int>, X2.14.20 <int>,
# X2.15.20 <int>, X2.16.20 <int>, X2.17.20 <int>, X2.18.20 <int>, …
# SQL PIVOT_LONGER: https://github.com/duckdb/duckdb/pull/6387 (0.8.0)
con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
duckdb::duckdb_register(con, "covid19", read.csv(url, stringsAsFactors = FALSE))
# Unpivot the wide date columns (all named like X1.22.20) into (date, cumulate)
dplyr::tbl(con, dplyr::sql("(PIVOT_LONGER covid19 ON COLUMNS('X') INTO NAME date VALUE cumulate)")) |>
dplyr::select(country = Country.Region, date, cumulate) |>
# Parse "X1.22.20" -> timestamp; daily new cases = difference of cumulates.
# NOTE(review): lag() has no order_by here, so the result depends on the
# pivot's row order — confirm that order is stable before trusting `value`.
dplyr::mutate(date = dplyr::sql("strptime(replace(date, 'X', ''), '%m.%d.%y')"),
value = cumulate - dplyr::lag(cumulate)) |>
dplyr::filter(date > "2020-02-23") |> head(3)
duckdb::dbDisconnect(con, shutdown = TRUE)
# Source: SQL [3 x 4]
# Database: DuckDB 0.7.2-dev2706 [root@Darwin 22.4.0:R 4.2.3/:memory:]
country date cumulate value
<chr> <dttm> <int> <int>
1 Afghanistan 2020-02-24 00:00:00 5 5
2 Afghanistan 2020-02-25 00:00:00 5 0
3 Afghanistan 2020-02-26 00:00:00 5 0
Neste exemplo será usado o Cloud Object Storage (COS) da IBM para armazenar um conjunto de dados em parquet.
# Source: table<s3://duckdb-ser/nyc-taxi.parquet> [?? x 11]
# Database: DuckDB 0.7.1 [root@Darwin 22.4.0:R 4.2.3/:memory:]
id vendor_id pickup_datetime dropoff_datetime passenger_count
<chr> <int> <chr> <chr> <int>
1 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1
2 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1
3 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1
4 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1
5 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1
6 id0801584 2 2016-01-30 22:01:40 2016-01-30 22:09:03 6
7 id1813257 1 2016-06-17 22:34:59 2016-06-17 22:40:40 4
8 id1324603 2 2016-05-21 07:54:58 2016-05-21 08:20:49 1
9 id1301050 1 2016-05-27 23:12:23 2016-05-27 23:16:38 1
10 id0012891 2 2016-03-10 21:45:01 2016-03-10 22:05:26 1
# ℹ more rows
# ℹ 6 more variables: pickup_longitude <dbl>, pickup_latitude <dbl>,
# dropoff_longitude <dbl>, dropoff_latitude <dbl>, store_and_fwd_flag <chr>,
# trip_duration <int>
# Enable DuckDB's spatial extension on the open connection.
# NOTE(review): assumes `con` already has httpfs loaded and S3 credentials
# configured so the s3:// path below resolves — confirm with earlier setup.
DBI::dbExecute(con, "INSTALL spatial;")
DBI::dbExecute(con, "LOAD spatial;")
# Build pickup/dropoff points, reproject EPSG:4326 -> ESRI:102718 (a
# feet-based plane system), then take the straight-line ("aerial")
# distance; /3280.84 converts feet to kilometres.
nyc_taxi_spatial <- dplyr::tbl(con, "s3://duckdb-ser/nyc-taxi.parquet") |>
dplyr::mutate(pickup_point = dplyr::sql("ST_Transform(ST_Point(pickup_latitude, pickup_longitude), 'EPSG:4326', 'ESRI:102718')"),
dropoff_point = dplyr::sql("ST_Transform(ST_Point(dropoff_latitude, dropoff_longitude), 'EPSG:4326', 'ESRI:102718')"),
aerial_distance = dplyr::sql("ST_Distance(pickup_point, dropoff_point)/3280.84")) |> dplyr::as_tibble()
# Show the first trip's coordinates and aerial distance with 5 decimals
first_trip <- nyc_taxi_spatial |>
  dplyr::select(pickup_longitude, pickup_latitude, dropoff_longitude,
                dropoff_latitude, aerial_distance) |>
  dplyr::slice(1)

first_trip |>
  tidyr::pivot_longer(tidyr::everything()) |>
  dplyr::mutate(value = tibble::num(value, digits = 5))
# A tibble: 5 × 2
name value
<chr> <num:.5!>
1 pickup_longitude -73.98215
2 pickup_latitude 40.76794
3 dropoff_longitude -73.96463
4 dropoff_latitude 40.76560
5 aerial_distance 1.50216
#* Return NYC taxi trip information for a given trip ID
#* @param id Trip identifier (e.g. "id2875421")
#* @get /info
function(id) {
  # Read environment variables (S3 region/endpoint/credentials)
  readRenviron("../.Renviron")
  # In-memory DuckDB connection; on.exit guarantees the connection is
  # closed even if the query below errors
  con <- duckdb::dbConnect(duckdb::duckdb(), ":memory:")
  on.exit(duckdb::dbDisconnect(con, shutdown = TRUE), add = TRUE)
  invisible(DBI::dbExecute(con, "INSTALL httpfs;"))
  invisible(DBI::dbExecute(con, "LOAD httpfs;"))
  invisible(DBI::dbExecute(con, glue::glue("SET s3_region='{Sys.getenv('S3_REGION')}';")))
  invisible(DBI::dbExecute(con, glue::glue("SET s3_endpoint='{Sys.getenv('S3_ENDPOINT')}';")))
  invisible(DBI::dbExecute(con, glue::glue("SET s3_access_key_id='{Sys.getenv('S3_ACCESS_KEY_ID')}';")))
  invisible(DBI::dbExecute(con, glue::glue("SET s3_secret_access_key='{Sys.getenv('S3_SECRET_ACCESS_KEY')}';")))
  # Query: `!!id` injects the function argument so it is not confused with
  # the `id` column (the previous code referenced an undefined `input`)
  resposta <- dplyr::tbl(con, "s3://duckdb-ser/nyc-taxi.parquet") |>
    dplyr::filter(id == !!id) |> dplyr::as_tibble() |> as.data.frame()
  # Result serialized as JSON
  jsonlite::toJSON(resposta)
}
# Call the plumber endpoint and render the parsed response as JSON
resp <- httr::GET("http://127.0.0.1:5830/info?id=id2875421")
resp |>
  httr::content() |>
  jsonlite::toJSON(auto_unbox = TRUE)
["[{\"id\":\"id2875421\",\"vendor_id\":2,\"pickup_datetime\":\"2016-03-14 17:24:55\",\"dropoff_datetime\":\"2016-03-14 17:32:30\",\"passenger_count\":1,\"pickup_longitude\":-73.9822,\"pickup_latitude\":40.7679,\"dropoff_longitude\":-73.9646,\"dropoff_latitude\":40.7656,\"store_and_fwd_flag\":\"N\",\"trip_duration\":455}]"]
th1460.github.io
github.com/th1460
medium.com/@thopr
linkedin.com/in/thop