Linguagem R

Além do #rstats

Thiago Pires

Thiago Pires

Cientista de Dados (CIO/IBM)

Formação:


  • Estatística (UERJ)
  • MSc. Epidemiologia (ENSP/FIOCRUZ)
  • DSc. Engenharia Biomédica (COPPE/UFRJ)

Linguagem R

  • A linguagem R foi derivada da linguagem S.
  • Foi lançada em 1993 pelos dois estatísticos Ross Ihaka e Robert Gentleman da Universidade de Auckland, Nova Zelândia
  • Atualmente na versão 4.2

Linguagem R

Sintaxe

# x recebe um inteiro
x <- 4

# y recebe uma string
y <- "versão"

# Função que concatena
paste(y, x)
[1] "versão 4"
# Criando funções
f <- function(x) {
    x^2
}

f(-4:4)
[1] 16  9  4  1  0  1  4  9 16
# Utilizando pipe (versão 4.1+)
-4:4 |> f()
[1] 16  9  4  1  0  1  4  9 16
# Trabalhando com dados
dados <- data.frame(x = c(4, 3, 5),
                    y = letters[3:5])
dados
  x y
1 4 c
2 3 d
3 5 e
# Criando variáveis
dados$z <- dados$x |> f()
dados
  x y  z
1 4 c 16
2 3 d  9
3 5 e 25
# Calculando medidas
dados$z |> mean()
[1] 16.66667

Linguagem R

Sintaxe

# Criando listas
list(nome = c("Thiago", "Pires"), 
     idade = 37,
     skills = data.frame(skill = c("R", "Python"), 
                         anos = c(15, 3)))
$nome
[1] "Thiago" "Pires" 

$idade
[1] 37

$skills
   skill anos
1      R   15
2 Python    3
# Criando gráficos
x <- seq(-5, 5, .1)
y <- f(x)

plot(x, y, type = "l", col = "blue")

Tidyverse e RStudio

Tidyverse é uma coleção de pacotes desenvolvidos pela RStudio e projetados para ciência de dados. Todos estes pacotes tentam compartilhar uma mesma sintaxe e estrutura de dados.

dplyr:: utilizado na manipulação de dados

Média, desvio-padrão e N da idade, segundo sexo

library(dplyr)

titanic::titanic_train |> 
    filter(Survived == 0) |> 
    group_by(Sex) |> 
    summarise(`Média` = mean(Age, na.rm = TRUE),
              `Desvio-padrão` = sd(Age, na.rm = TRUE),
              N = n()) |> 
    mutate(across(c(2, 3), \(x) round(x, 1)))
# A tibble: 2 × 4
  Sex    Média `Desvio-padrão`     N
  <chr>  <dbl>           <dbl> <int>
1 female  25              13.6    81
2 male    31.6            14.1   468

Percentual de mortos, segundo sexo

titanic::titanic_train |>
    count(Survived, Sex) |>
    group_by(Sex) |>
    mutate(`%` = round(n/sum(n) * 100, 1)) |>
    filter(Survived == 0)
# A tibble: 2 × 4
# Groups:   Sex [2]
  Survived Sex        n   `%`
     <int> <chr>  <int> <dbl>
1        0 female    81  25.8
2        0 male     468  81.1

ggplot2:: utilizado na visualização de dados

Percentual de mortos e vivos, segundo sexo e classe

library(ggplot2)

titanic::titanic_train |> 
    ggplot() +
    aes(Sex, ..count../sum(..count..), 
        group = Survived, 
        fill = as.factor(Survived)) +
    geom_bar(position = "fill") +
    facet_grid(~Pclass) +
    scale_y_continuous(labels = scales::percent) +
    scale_fill_manual(values = c("#003f9a", "#5099f4"), 
                      labels = c("não", "sim")) +
    labs(x = "Sexo", y = "", fill = "Sobreviveu") +
    theme_light() + 
    theme(text = element_text(size = 24))

stringr:: e forcats::

titanic::titanic_train$Name[4]
[1] "Futrelle, Mrs. Jacques Heath (Lily May Peel)"
titanic::titanic_train |> 
    dplyr::mutate(
        Title = Name |> 
            stringr::str_extract(
                "(?<=\\,\\s)(.*)(?=\\.)") |> 
            forcats::fct_lump(n = 4)) |>  
    ggplot() + 
    aes(Title, Age) +
    geom_boxplot() +
    theme(text = element_text(size = 24))

readr::, tidyr:: e lubridate:: para mais manipulações

url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"

covid19 <- 
    readr::read_csv(url) |> 
    dplyr::filter(`Country/Region` == "Brazil") |> 
    dplyr::select(- c(`Province/State`, Lat, Long))

covid19
# A tibble: 1 × 976
  Country/Regi…¹ 1/22/…² 1/23/…³ 1/24/…⁴ 1/25/…⁵ 1/26/…⁶ 1/27/…⁷ 1/28/…⁸ 1/29/…⁹
  <chr>            <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 Brazil               0       0       0       0       0       0       0       0
# … with 967 more variables: `1/30/20` <dbl>, `1/31/20` <dbl>, `2/1/20` <dbl>,
#   `2/2/20` <dbl>, `2/3/20` <dbl>, `2/4/20` <dbl>, `2/5/20` <dbl>,
#   `2/6/20` <dbl>, `2/7/20` <dbl>, `2/8/20` <dbl>, `2/9/20` <dbl>,
#   `2/10/20` <dbl>, `2/11/20` <dbl>, `2/12/20` <dbl>, `2/13/20` <dbl>,
#   `2/14/20` <dbl>, `2/15/20` <dbl>, `2/16/20` <dbl>, `2/17/20` <dbl>,
#   `2/18/20` <dbl>, `2/19/20` <dbl>, `2/20/20` <dbl>, `2/21/20` <dbl>,
#   `2/22/20` <dbl>, `2/23/20` <dbl>, `2/24/20` <dbl>, `2/25/20` <dbl>, …
# ℹ Use `colnames()` to see all variable names
covid19 <- 
    covid19 |> 
    tidyr::pivot_longer(!`Country/Region`, 
                        names_to = "date", 
                        values_to = "cumulate") |> 
    dplyr::group_by(`Country/Region`) |> 
    dplyr::mutate(date = lubridate::mdy(date),
                  value = cumulate - dplyr::lag(cumulate)) |> 
    dplyr::rename(country = `Country/Region`)

covid19 |> head(5)
# A tibble: 5 × 4
# Groups:   country [1]
  country date       cumulate value
  <chr>   <date>        <dbl> <dbl>
1 Brazil  2020-01-22        0    NA
2 Brazil  2020-01-23        0     0
3 Brazil  2020-01-24        0     0
4 Brazil  2020-01-25        0     0
5 Brazil  2020-01-26        0     0

ggplot2:: na visualização de casos de COVID 19

covid19 |> 
    dplyr::mutate(moving_average = zoo::rollmean(value, 7, align = "right", fill = NA)) |> 
    ggplot2::ggplot() +
    ggplot2::aes(date, moving_average) +
    ggplot2::geom_line() +
    ggplot2::theme_minimal() +
    ggplot2::labs(x = "Data", 
                  y = "Média móvel casos COVID 19") +
    theme(text = element_text(size = 22))

plotly:: gráficos dinâmicos

covid19 |> 
    dplyr::mutate(moving_average = zoo::rollmean(value, 7, align = "right", fill = NA)) |> 
    plotly::plot_ly(x = ~date, y = ~moving_average, 
                    type = "scatter", mode = "lines") |> 
    plotly::layout(xaxis = list(title = "Date"),
                   yaxis = list(title = "Média móvel casos COVID 19"),
                   font = list(size = 20), margin = list(t = 100)) |> 
    plotly::rangeslider() |> 
    widgetframe::frameWidget()

leaflet:: mapas dinâmicos

library(leaflet)

locale <- dplyr::tibble(label = "IBM Hortolândia",
                        lat = -22.8996401,
                        lng = -47.2032362,
                        logo = "https://www.ibm.com/brand/experience-guides/developer/8f4e3cc2b5d52354a6d43c8edba1e3c9/02_8-bar-reverse.svg")

leaflet(options = list(closePopupOnClick = FALSE)) |> 
    setView(lng = -46.9, 
            lat = -22.8, 
            zoom = 7) |> 
    addTiles() |> 
    addPopups(data = locale, 
              ~lng, 
              ~lat,
              options = popupOptions(closeButton = FALSE), 
              popup = ~paste("<img src=", 
                             logo, "width=100%> <br>", 
                             label))

leaflet:: rota do Titanic

events <- bind_rows(
    tibble(location = "Southampton (10-04-1912)", 
           lng = -1.4191, lat = 50.7894),
    tibble(location = "Cherbourg (10-04-1912)", 
           lng = -1.6109, lat = 49.6445),
    tibble(location = "Queenstown (11-04-1912)", 
           lng = -8.3211, lat = 51.8535),
    tibble(location = "Naufrágio (14-04-1912)", 
           lng = -49.9408, lat = 41.7258),
    tibble(location = "New York", 
           lng = -73.9655, lat = 40.6832))

leaflet() |> 
    setView(lng = -33.9, 
            lat = 46.8, 
            zoom = 3) |> 
    addTiles() |> 
    addCircleMarkers(data = events |> slice(1:3, 5), 
                     label = ~location,
                     color = c(rep("blue", 3), "green")) |> 
    addMarkers(data = events |> slice(4), 
               icon = list(
                   iconUrl = "resources/images/sinking-ship.jpeg", 
                   iconSize = c(50, 50)), 
               label = ~location) |> 
    addPolylines(data = events, ~lng, ~lat) |> 
    widgetframe::frameWidget()

DBI::, RJDBC:: e dbplyr:: conectando com o DB2 na IBM Cloud

library(dbplyr)

# Ler variáveis de ambiente
readRenviron("../.Renviron") 

# Conexão com o DB2
drv <-
    RJDBC::JDBC("com.ibm.db2.jcc.DB2Driver", 
                "../jars/db2jcc4.jar")

host <- Sys.getenv("DB2_HOST")
user <- Sys.getenv("DB2_USER")
password <- Sys.getenv("DB2_PASSWORD")

uri <- 
    sprintf("jdbc:db2://%s/bludb:user=%s;password=%s;sslConnection=true;", 
            host, user, password)

db2 <-
    DBI::dbConnect(drv, uri)

# Enviando tabela para o DB2
DBI::dbWriteTable(db2, "COVID19", 
                  value = na.omit(covid19), 
                  overwrite = TRUE)

# Fazendo consulas no DB2
dplyr::tbl(db2, "COVID19") |> 
    dplyr::filter(dplyr::between(DATE, 
                                 "2022-06-07", 
                                 "2022-06-15")) |> 
    dplyr::select(COUNTRY, DATE, VALUE)
# Source:   SQL [9 x 3]
# Database: JDBCConnection
  COUNTRY DATE       VALUE
  <chr>   <chr>      <dbl>
1 Brazil  2022-06-07 71045
2 Brazil  2022-06-08 49614
3 Brazil  2022-06-09 45073
4 Brazil  2022-06-10 56491
5 Brazil  2022-06-11 27796
6 Brazil  2022-06-12 11728
7 Brazil  2022-06-13 40173
8 Brazil  2022-06-14 44441
9 Brazil  2022-06-15 70290

DBI::, RJDBC:: e dbplyr:: conectando com o DB2 na IBM Cloud

dplyr::tbl(db2, "COVID19") |> 
    dplyr::filter(dplyr::between(DATE, 
                                 "2022-06-07", 
                                 "2022-06-15")) |> 
    dplyr::select(COUNTRY, DATE, VALUE) |> 
    dplyr::show_query()
<SQL>
SELECT "COUNTRY", "DATE", "VALUE"
FROM "COVID19"
WHERE ("DATE" BETWEEN '2022-06-07' AND '2022-06-15')

tidymodels:: aprendizado supervisionado

library(tidymodels)
set.seed(555)

titanic <- 
    titanic::titanic_train |> 
    dplyr::mutate(Pclass = relevel(factor(Pclass, labels = c("1st", "2nd", "3rd")),
                                   ref = "3rd"),
                  Survived = factor(Survived, labels = c("não", "sim")),
                  Sex = factor(Sex, levels = c("male", "female")))

# Separar 3/4 dos dados para treino 
data_split <- 
    initial_split(titanic, prop = 3/4)

train_data <- training(data_split)
test_data  <- testing(data_split)

# Ajuste do modelo
lr_mod <- 
    logistic_reg() |>  
    set_engine("glm")

lr_fit <- 
    lr_mod |>  
    fit(Survived ~ Sex + Pclass, data = train_data)

lr_fit |> broom::tidy() |> dplyr::mutate(exp(estimate))
# A tibble: 4 × 6
  term        estimate std.error statistic  p.value `exp(estimate)`
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>           <dbl>
1 (Intercept)    -2.42     0.193    -12.5  8.35e-36          0.0893
2 Sexfemale       2.75     0.219     12.6  2.89e-36         15.6   
3 Pclass1st       1.98     0.253      7.82 5.28e-15          7.22  
4 Pclass2nd       1.19     0.261      4.56 5.01e- 6          3.29  
  • A chance de uma mulher sobreviver é 15 vezes a chance de um homem sobreviver
  • A chance de uma pessoa da 1st classe sobreviver é 7 vezes a chance de um pessoa da 3rd classe sobreviver
  • A chance de uma pessoa da 2nd classe sobreviver é 3 vezes a chance de um pessoa da 3rd classe sobreviver

tidymodels:: visualizando o modelo

newdata <- 
    expand.grid(Pclass = c("1st", "2nd", "3rd"),
                Sex = c("male", "female"))
pihat <- 
    (lr_fit |> predict(newdata, type = "prob")) |> 
    pull(.pred_sim)

newdata |> mutate(Pihat = pihat) |>  
    ggplot(aes(Sex, Pihat, 
               group = Pclass, 
               colour = Pclass)) + 
    geom_line() + geom_point() +
    labs(x = "Sex", y = expression(pi(Survived == sim)), 
         colour = "Ticket Class") +
    theme_minimal() +
    theme(text = element_text(size = 22))

tidymodels:: avaliando o modelo

measure <- function(data) {
    
    data |> 
        accuracy(truth = Survived, .pred_class) |>  
        
        bind_rows(
            data |>  
                f_meas(truth = Survived, .pred_class))
}

predict(lr_fit, test_data) |> 
    bind_cols(predict(lr_fit, 
                      test_data, type = "prob")) |> 
    bind_cols(test_data |>  select(Survived)) |> 
    measure()
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.762
2 f_meas   binary         0.803

pins:: versionando recursos no cloud object storage (COS) na IBM Cloud

Versionando o modelo selecionado para a predição da probabilidade de sobreviver ao desastre do titanic.

# Obtendo o modelo
saved_lr_fit <- tidypredict::parse_model(lr_fit)

# Conexão com o COS
board <- pins::board_s3(bucket = Sys.getenv("COS_BUCKET"),
                        region = Sys.getenv("COS_REGION"),
                        access_key = Sys.getenv("COS_ACCESS_KEY_ID"),
                        secret_access_key = Sys.getenv("COS_SECRET_ACCESS_KEY"),
                        endpoint = Sys.getenv("COS_ENDPOINT"))
# Salvando o modelo
board |> pins::pin_write(saved_lr_fit, name = "titanic-model")

Lendo o modelo do COS e fazendo a predição

model <- board |> pins::pin_read("titanic-model")
input <- data.frame(Sex = "male", Pclass = "3rd")
pred <- tidypredict::tidypredict_to_column(input, model)

pred
   Sex Pclass        fit
1 male    3rd 0.08197411

plumber:: criando api

# plumber.R

#* @apiTitle Prediction Survived in Titanic Disaster

#* @param sex Sex
#* @param pclass Class
#* @post /predict
function(sex, pclass) {
    
    board <- pins::board_s3(bucket = Sys.getenv("COS_BUCKET"),
                            region = Sys.getenv("COS_REGION"),
                            access_key = Sys.getenv("COS_ACCESS_KEY_ID"),
                            secret_access_key = Sys.getenv("COS_SECRET_ACCESS_KEY"),
                            endpoint = Sys.getenv("COS_ENDPOINT"))
    model <- board |> pins::pin_read("titanic-model")
    input <- data.frame(Sex = sex, Pclass = pclass)
    pred <- tidypredict::tidypredict_to_column(input, model)
    
    return(pred)
    
}

Executando a api

plumber::pr_run(plumber::pr("R/plumber.R"), host = "0.0.0.0", port = 8080)

httr:: fazendo requisições

"http://localhost:8080/predict" |> 
    httr::POST(body = list(sex = "female", pclass = "1st"), encode = "json") |>
    httr::content() |> 
    jsonlite::toJSON(pretty = TRUE,
                     auto_unbox = TRUE)
[
{
"Sex": "female",
"Pclass": "1st",
"fit": 0.9097
}
]
"http://localhost:8080/predict" |> 
    httr::POST(body = list(sex = "male", pclass = "3rd"), encode = "json") |>
    httr::content() |> 
    jsonlite::toJSON(pretty = TRUE,
                     auto_unbox = TRUE)
[
{
"Sex": "male",
"Pclass": "3rd",
"fit": 0.082
}
] 

quantmod:: preços de ações

dataStocks <- function(code, from, to) {
    
    stock <- 
        quantmod::getSymbols(code, 
                             src = "yahoo", 
                             from = from, 
                             to = to, 
                             auto.assign = FALSE)
    
    stock <-
        stock |>
        as.data.frame() |> 
        tibble::rownames_to_column(var = "Date") |> 
        dplyr::as_tibble() |> 
        dplyr::select(dplyr::matches("Date|Adjusted")) |> 
        dplyr::mutate(Date = Date |> lubridate::as_date()) |> 
        dplyr::rename(Adjusted = 2)
    
    return(stock)
    
}

shiny:: criando aplicações web

library(shiny)

ui <- fluidPage(
    h1("Olá Shiny!")
    
)

server <- function(input, output, session) {
    
}

shinyApp(ui, server)

shiny:: app para obter preços de ações

library(shiny)
library(shinythemes)

source("dataStocks.R")

start_value <- Sys.Date() - 31
end_value <- Sys.Date()

ui <- fluidPage(
    theme = shinytheme("sandstone"),
    navbarPage("STOCK PRICE"),
    sidebarLayout(
        sidebarPanel(
            textInput("code", "Stock code", value = "PETR4.SA"),
            dateRangeInput("date", "Date", 
                           start = start_value,
                           end = end_value)
        ),
        
        mainPanel(
            plotly::plotlyOutput("plot")
        )
    )
)

server <- function(input, output, session) {
    
    data_stock <- reactive({
        
        validate(
            need(input$code, "Please type a stock code!"),
            need(input$date[1], "Please choose a start date!"),
            need(input$date[2], "Please choose a end date!"))
        
        dataStocks(input$code, input$date[1], input$date[2])
    })
    
    output$plot <- plotly::renderPlotly({
        plotly::plot_ly(data_stock(), 
                        x = ~Date, 
                        y = ~Adjusted, 
                        color = "orange",
                        mode = "lines",
                        type = "scatter") |> 
            plotly::rangeslider()
    })
    
}

shinyApp(ui, server)

R com docker

Dockerfile

FROM openwhisk/dockerskeleton
ARG NOT_CRAN=true
ARG ARROW_R_DEV=true
RUN apk update && apk add R R-dev R-doc \
build-base libsodium-dev autoconf automake bash \ 
cmake g++ gcc make libxml2-dev
RUN R -e "install.packages(c('jsonlite', 'tidypredict', 'yaml', \ 
'pins', 'paws.storage'), repos = 'http://cran.rstudio.com/')"

Build

podman build -t docker.io/th1460/titanic-classifier .

Push para um registry

podman login -u <user> -p <password> docker.io/th1460/titanic-classifier
podman push docker.io/th1460/titanic-classifier

R e IBM Cloud Functions

O serviço IBM Cloud Functions é uma plataforma de computação orientada a eventos, também conhecida como computação sem servidor ou como Function-as-a-Service (FaaS), que executa código em resposta a eventos ou chamadas diretas.

exec

#!/bin/bash
# run R script
chmod +x script.R # turn executable
echo "$@" > input.json # set input
./script.R # run script

script.R

#!/usr/bin/env Rscript

readRenviron(".Renviron")

# load model
board <- pins::board_s3(bucket = Sys.getenv("COS_BUCKET"), 
region = Sys.getenv("COS_REGION"), 
access_key = Sys.getenv("COS_ACCESS_KEY_ID"),
secret_access_key = Sys.getenv("COS_SECRET_ACCESS_KEY"),
endpoint = Sys.getenv("COS_ENDPOINT"))

model <- pins::pin_read(board, "titanic-model")

# input
input <- jsonlite::fromJSON("input.json", flatten = FALSE)

# predict
pred <- tidypredict::tidypredict_to_column(as.data.frame(input), model)

# output
jsonlite::stream_out(pred, verbose = FALSE)

Deploy do modelo na IBM Cloud Functions

Login na IBM Cloud

ibmcloud login -sso
ibmcloud target --cf

Empacotar os arquivos para o deploy

zip -r titanic.zip exec script.R .Renviron

Deploy

ibmcloud fn action create titanic-classifier titanic.zip \ 
--docker th1460/titanic-classifier --web true

Descobrindo qual é a url

ibmcloud fn action get titanic-classifier --url

Fazendo uma requisição na function

results <- "<URL>.json" |> 
    httr::POST(body = list(Sex = "male", Pclass = "3rd"), 
               encode = "json") |> httr::content()

results[c("Sex", "Pclass", "fit")] |> 
    jsonlite::toJSON(pretty = TRUE,
                     auto_unbox = TRUE)
{
"Sex": "male",
"Pclass": "3rd",
"fit": 0.082
} 

rayshader:: mapas em 3D

# Load elevation data (https://th1460.github.io/posts/2022/08/mapa-3d-com-rayshader/)
elev_file <- "../data/rj.tif"
elev_img <- raster::raster(elev_file)
elev_matrix <- 
    matrix(raster::extract(elev_img, 
                           raster::extent(elev_img), 
                           buffer = 1000), 
           nrow = ncol(elev_img), 
           ncol = nrow(elev_img)
    )
# Define bounding box with longitude/latitude coordinates
bbox <- list(
    p2 = list(long = -43.2328217452992, lat = -22.99560928307949),
    p1 = list(long = -43.133008448808454, lat = -22.930329210944166)
)

leaflet::leaflet() |> 
    leaflet::addTiles() |> 
    leaflet::addRectangles(
        lng1 = bbox$p1$long, lat1 = bbox$p1$lat,
        lng2 = bbox$p2$long, lat2 = bbox$p2$lat,
        fillColor = "transparent"
    ) |> 
    leaflet::fitBounds(
        lng1 = bbox$p1$long, lat1 = bbox$p1$lat,
        lng2 = bbox$p2$long, lat2 = bbox$p2$lat,
    )

rayshader:: mapas em 3D

source("../src/define_image_size.R")
source("../src/get_arcgis_map_image.R")

image_size <- define_image_size(bbox, major_dim = 600)

# Fetch overlay image
overlay_file <- "../data/rj_overlay.png"

get_arcgis_map_image(bbox, map_type = "World_Imagery", file = overlay_file,
                     width = image_size$width, height = image_size$height, 
                     sr_bbox = 4326)

overlay_img <- png::readPNG(overlay_file)

# Calculate rayshader layers
watermap <- rayshader::detect_water(elev_matrix)

# Plot 2D with texture
elev_matrix |> 
    rayshader::sphere_shade(texture = "imhof4") |> 
    rayshader::add_water(watermap, color = "imhof4") |> 
    rayshader::add_overlay(overlay_img, alphalayer = 0.5) |> 
    rayshader::plot_map()

rayshader:: mapas em 3D

# Plot 3D
zscale <- 30
rgl::clear3d()
elev_matrix |> 
    rayshader::sphere_shade(texture = "imhof4") |> 
    rayshader::add_water(watermap, color = "imhof4") |> 
    rayshader::add_overlay(overlay_img, alphalayer = .8) |> 
    plot_3d(elev_matrix, zscale = zscale, windowsize = c(1500, 1200),
            water = TRUE, soliddepth = -max(elev_matrix)/zscale, wateralpha = 1,
            theta = 25, phi = 30, zoom = 0.65, fov = 60)
rayshader::render_snapshot("../data/rj-3d.png")

purrr:: programação funcional

Fazendo loop com for

x <- NULL
for (i in 1:10) {
    x <- c(x, i^2)
}
x
 [1]   1   4   9  16  25  36  49  64  81 100

Fazendo loop com map_dbl

1:10 |> 
    purrr::map_dbl(\(x) x^2)
 [1]   1   4   9  16  25  36  49  64  81 100

keras::dataset_imdb() mineração de texto

Estruturando a base de dados

imdb <- keras::dataset_imdb(num_words = 10000)

set.seed(1984)
train_labels <- 
    dplyr::tibble(y = imdb$train$y, id = 1:25000) |>  
    dplyr::group_by(y) |> 
    dplyr::sample_n(size = 5000) |> 
    dplyr::ungroup()

train_data <- 
    train_labels |> 
    dplyr::pull(id) |> 
    purrr::map(~ imdb$train$x[[.x]])

keras::dataset_imdb_word_index() mineração de texto

Decodificando o review

word_index <- keras::dataset_imdb_word_index()
reverse_word_index <- names(word_index)
names(reverse_word_index) <- word_index

decoded_review <- function(x) {sapply(train_data[[x]], function(index) {
    word <- if (index >= 3) reverse_word_index[[as.character(index - 3)]]
    if (!is.null(word)) word else "?"
})}

data_decoded_review <- 
    seq(10000) |> 
    purrr::map(~ decoded_review(.x))

data_decoded_review <-
    tibble::enframe(data_decoded_review,
                    name = "index",
                    value = "words")

data_decoded_review <-
    data_decoded_review |> 
    dplyr::bind_cols(train_labels)

data_decoded_review |> 
    saveRDS("../data/data_decoded_review.RDS")

tidytext:: e wordcloud::

data_decoded_review <- readRDS("../data/data_decoded_review.RDS")

set.seed(1984)
data_decoded_review |> 
    tidyr::unnest(words) |> 
    dplyr::anti_join(tidytext::stop_words, by = c("words" = "word")) |> 
    dplyr::filter(! words %in% c("?", "br", "1", "2", 
                                 "3", "4", "7", "8", 
                                 "9", "10")) |> 
    dplyr::count(words, y, sort = TRUE) |> 
    dplyr::mutate(y = ifelse(y == 0, "Negativo", "Positivo")) |> 
    reshape2::acast(words ~ y, value.var = "n", fill = 0) |> 
    wordcloud::comparison.cloud(colors = c("#E24664", "#1D6CE6"),
                                max.words = 200,
                                title.size = 4,
                                scale = c(5, 2))

tensorflow:: deep learning

Criando um modelo para avaliar se um review de filme é positivo ou negativo

imdb <- keras::dataset_imdb(num_words = 10000)

set.seed(1984)
train_labels <- 
    dplyr::tibble(y = imdb$train$y, id = 1:25000) |>  
    dplyr::group_by(y) |> 
    dplyr::sample_n(size = 5000) |> 
    dplyr::ungroup()

train_data <- 
    train_labels |> 
    dplyr::pull(id) |> 
    purrr::map(~ imdb$train$x[[.x]])

saveRDS(list(data = train_data, 
             labels = train_labels), "../data/train_imdb.RDS")

test_labels <- 
    dplyr::tibble(y = imdb$test$y, id = 1:25000) |>  
    dplyr::group_by(y) |> 
    dplyr::sample_n(size = 5000) |> 
    dplyr::ungroup()

test_data <- 
    test_labels |> 
    dplyr::pull(id) |> 
    purrr::map(~ imdb$test$x[[.x]])

saveRDS(list(data = test_data, 
             labels = test_labels), "../data/test_imdb.RDS")

tensorflow:: deep learning

Função para Tranformando os reviews (inputs) em um vetor de 0s e 1s

vectorize_sequences <- function(sequences, dimension = 10000) {
    # Creates an all-zero matrix of shape (length(sequences), dimension)
    results <- matrix(0, nrow = length(sequences), ncol = dimension) 
    for (i in 1:length(sequences))
        # Sets specific indices of results[i] to 1s
        results[i, sequences[[i]]] <- 1 
    results
}
train_imdb <- readRDS("../data/train_imdb.RDS")
test_imdb <- readRDS("../data/test_imdb.RDS")

# tranformando os reviews (inputs) em um vetor de 0s e 1s
x_train <- vectorize_sequences(train_imdb$data)
x_test <- vectorize_sequences(test_imdb$data)
# transformando os outpus em numérico
y_train <- as.numeric(train_imdb$labels |> pull(y))
y_test <- as.numeric(train_imdb$labels |> pull(y))

tensorflow:: deep learning

Estrutura da rede

DiagrammeR::grViz("resources/images/mlp.gv")

tensorflow:: deep learning

Construindo a rede

model <- keras::keras_model_sequential() |> 
    keras::layer_dense(units = 10, activation = "relu", input_shape = c(10000)) |> 
    keras::layer_dense(units = 10, activation = "relu") |> 
    keras::layer_dense(units = 1, activation = "sigmoid")

Treinamento

model |> keras::compile(
    optimizer = "rmsprop",
    loss = "binary_crossentropy",
    metrics = c("accuracy")
)
model |> keras::fit(x_train, y_train, epochs = 4, batch_size = 512)

Avaliando o modelo

(results <- model |> keras::evaluate(x_test, y_test))
    loss accuracy 
0.350753 0.870200 

Salvando o modelo

model |> keras::export_savedmodel("../data/savedmodel")

tensorflow:: deep learning

Classificando um review

# codificação
coded_review <- function(review){
    
    word_index <- keras::dataset_imdb_word_index()
    reverse_word_index <- unlist(word_index)
    
    words <-
        (review |> 
             tolower() |> 
             strsplit(split = " "))[[1]] |> 
        gsub("[!,:]", "", x = _)
    
    index <- reverse_word_index[names(reverse_word_index) %in% words]
    
    return(list(as.numeric(index) + 3))
    
}

(exemplo1 <- 
  "The film is best, great, beautiful and wonderful!" |>  
  coded_review()) |> str()
List of 1
 $ : num [1:8] 118 307 9 87 4 5 389 22
# vetorização
(exemplo_vector1 <- vectorize_sequences(exemplo1)) |> str()
 num [1, 1:10000] 0 0 0 1 1 0 0 0 1 0 ...
# classificação
exemplo_vector1 |> 
  tfdeploy::predict_savedmodel("../data/savedmodel")
$dense
[1] 0.8616343

Problema de otimização de rota de frota

Mapa do centro de Campinas SP

bbox <- list(
    p1 = list(long = -47.0708, lat = -22.9075),
    p2 = list(long = -47.0348, lat = -22.8804)
)

leaflet::leaflet() |> 
    leaflet::addTiles()|> 
    leaflet::addRectangles(
        lng1 = bbox$p1$long, lat1 = bbox$p1$lat,
        lng2 = bbox$p2$long, lat2 = bbox$p2$lat,
        fillColor = "transparent"
    ) |> 
    leaflet::fitBounds(
        lng1 = bbox$p1$long, lat1 = bbox$p1$lat,
        lng2 = bbox$p2$long, lat2 = bbox$p2$lat,
    )

Vias como linhas baseado no osmdata

if (!exists("has_internet_via_proxy")) {
    assign("has_internet_via_proxy", TRUE, environment(curl::has_internet))   
}

# Consulta
query <- osmdata::opq(bbox =  unlist(bbox)) |> 
    osmdata::add_osm_feature(key = "highway") |> 
    osmdata::osmdata_sf()

# Retornando linhas
query_lines <- query$osm_lines

# Visualizando o objeto
query_lines |> 
    sf::st_geometry() |> 
    ggplot2::ggplot() +
    ggplot2::geom_sf(colour = "gray") +
    ggplot2::theme_minimal() +
    ggplot2::theme(text = element_text(size = 15))

Convertendo as vias em um objeto sfnetwork

poly_to_lines <- sf::st_cast(query$osm_polygons, "LINESTRING")

query_lines <- dplyr::bind_rows(query_lines, poly_to_lines)

highway_net <- sfnetworks::as_sfnetwork(query_lines, directed = FALSE)

# Remove nodes that have only two edges connected
highway_simple <- tidygraph::convert(highway_net, sfnetworks::to_spatial_smooth)

# Filter connected components
connected_net <- highway_simple |> 
    tidygraph::activate("nodes") |> 
    dplyr::filter(tidygraph::group_components() == 1)

# Plot
ggplot2::ggplot() +
    ggplot2::geom_sf(data = sf::st_as_sf(connected_net, "edges"), col = "gray") +
    ggplot2::geom_sf(data = sf::st_as_sf(connected_net, "nodes"), col = "gray") +
    ggplot2::theme_minimal() +
    ggplot2::theme(text = element_text(size = 15))

Problema de otimização de rota de frota

São 3 veículos e preciso selecionar a melhor rota. Eles devem sair e retornar a um depósito.

set.seed(711)
index_visits <- sample(1:603, 9)
visits <- connected_net |> 
    tidygraph::activate("nodes") |> 
    dplyr::slice(index_visits)

depot <- connected_net |> 
    tidygraph::activate("nodes") |> 
    dplyr::slice(23)

ggplot2::ggplot() + 
    ggplot2::geom_sf(data = sf::st_as_sf(connected_net, "edges"), col = "gray") +
    ggplot2::geom_sf(data = sf::st_as_sf(connected_net, "nodes"), col = "gray") +
    ggplot2::theme_minimal() +
    ggplot2::theme(text = element_text(size = 15)) +
    ggplot2::geom_sf(data = sf::st_as_sf(visits, "nodes"), 
                     ggplot2::aes(col = "Visits"), size = 6) +
    ggplot2::geom_sf(data = sf::st_as_sf(depot, "nodes"), 
                     ggplot2::aes(col = "Depot"), size = 6) +
    ggplot2::labs(x = "", y = "", colour = "") +
    ggplot2::scale_color_manual(values = c("Visits" = "blue", 
                                           "Depot" = "orange"))

Problema de otimização de rota de frota

Cálculo da matriz de custo

connected_net <- 
    connected_net |> 
    tidygraph::activate("edges") |> 
    dplyr::mutate(weight = sfnetworks::edge_length())

cost_matrix_route_problem <- 
    sfnetworks::st_network_cost(connected_net, 
                                from = c(23, index_visits), 
                                to = c(23, index_visits), 
                                weights = "weight")

mode(cost_matrix_route_problem) <- "integer"

cost_matrix_route_problem
Units: [m]
      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
 [1,]    0 3016 3476  308 3118 3307 4848 1631 3902  4196
 [2,] 3016    0 3335 2708 2424 4025 6296 2764 1059  5639
 [3,] 3476 3335    0 3167 4183 4899 4506 2152 4175  4939
 [4,]  308 2708 3167    0 3187 2999 4540 1323 3593  3888
 [5,] 3118 2424 4183 3187    0 3165 5855 3818 3264  5128
 [6,] 3307 4025 4899 2999 3165    0 4941 3055 4910  5375
 [7,] 4848 6296 4506 4540 5855 4941    0 5326 7150  2709
 [8,] 1631 2764 2152 1323 3818 3055 5326    0 3649  4844
 [9,] 3902 1059 4175 3593 3264 4910 7150 3649    0  6479
[10,] 4196 5639 4939 3888 5128 5375 2709 4844 6479     0

Problema de otimização de rota de frota utilizando ortools e reticulate::

reticulate::use_virtualenv("../venv", required = TRUE)
constraint_solver <- reticulate::import("ortools.constraint_solver")

# Create data
create_data_model <- function() {
    
    data <- reticulate::dict()
    data['distance_matrix'] <- cost_matrix_route_problem
    data['distance_matrix'] <-  data['distance_matrix']$tolist()
    data['num_vehicles'] <- 3L
    data['depot'] <- 0L
    
    return(data)
    
}

# Create the routing model
data <- create_data_model()
manager <- constraint_solver$pywrapcp$RoutingIndexManager(
    length(data['distance_matrix']),
    data['num_vehicles'],
    data['depot']
)

routing <- constraint_solver$pywrapcp$RoutingModel(manager)

# Create the distance callback
distance_callback <- function(from_index, to_index) {
    
    from_node <- manager$IndexToNode(from_index)
    to_node <- manager$IndexToNode(to_index)
    
    return(data['distance_matrix'][from_node][to_node])
    
}

transit_callback_index <- routing$RegisterTransitCallback(distance_callback)

# Set the cost of travel
routing$SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

# Add a distance dimension
dimension_name <- 'Distance'
routing$AddDimension(
    transit_callback_index,
    0L,  # no slack
    15000L,  # vehicle maximum travel distance
    TRUE,  # start cumul to zero
    dimension_name)

distance_dimension <- routing$GetDimensionOrDie(dimension_name)
distance_dimension$SetGlobalSpanCostCoefficient(100L)

# Set search parameters
search_parameters <- constraint_solver$pywrapcp$DefaultRoutingSearchParameters()
search_parameters$first_solution_strategy = (
    constraint_solver$routing_enums_pb2$FirstSolutionStrategy$PATH_CHEAPEST_ARC)

solution <- routing$SolveWithParameters(search_parameters)

# Save routes to a list or array
get_routes <- function(solution, routing, manager) {
    
    routes <- list()
    
    for (route_nbr in seq(0, routing$vehicles() - 1)) {
        
        index <- routing$Start(route_nbr)
        route <- c(manager$IndexToNode(index))
        
        while(!routing$IsEnd(index)) {
            
            index <- solution$Value(routing$NextVar(index))
            route <- append(route, manager$IndexToNode(index))
            
        }
        
        routes[[route_nbr + 1]] <- route
        names(routes)[route_nbr + 1] <- route_nbr
        
    }
    
    return(routes)
    
}

(routes <- get_routes(solution, routing, manager))
$`0`
[1] 0 6 9 0

$`1`
[1] 0 3 2 5 0

$`2`
[1] 0 7 1 8 4 0

Problema de otimização de rota de frota

real_routes <- lapply(readRDS("../data/routes.RDS"), \(x) c(23, index_visits)[x + 1])

vrp_paths <- function(x) {
    
    results <- list()
    for (i in seq(length(x))) {
        
        from_idxs <- unique(x[[i]])
        to_idxs <- c(from_idxs[2:length(from_idxs)], from_idxs[1])
        
        paths <- mapply(sfnetworks::st_network_paths,
                        from = from_idxs,
                        to = to_idxs,
                        MoreArgs = list(x = connected_net, weights = "weight"))    
        
        results[[i]] <- paths
    }
    
    return(results)
    
}

# Plot results
vrp_paths_fun <- function(x, index, vehicle) {
    
    vrp_edge_paths <- connected_net |> 
        tidygraph::activate("edges") |> 
        dplyr::slice(vrp_paths(x)[[vehicle]][, index]$edge_paths |> unlist())
    
    ggplot2::geom_sf(data = sf::st_as_sf(vrp_edge_paths, "edges"),
                     ggplot2::aes(col = as.character(vehicle)), lwd = 1.3)
    
}

p <- ggplot2::ggplot() +
    ggplot2::geom_sf(data = sf::st_as_sf(connected_net, "edges"), col = "gray") +
    ggplot2::geom_sf(data = sf::st_as_sf(connected_net, "nodes"), col = "gray") +
    ggplot2::theme_minimal()

for (j in seq(length(real_routes))) {
    for (i in seq(length(real_routes[[j]]) - 1)) {
        p <- p + vrp_paths_fun(real_routes, i, j)   
    }
}

p <- p + ggplot2::geom_sf(data = sf::st_as_sf(visits, "nodes"), 
                     ggplot2::aes(col = "Visits"), size = 6) +
    ggplot2::geom_sf(data = sf::st_as_sf(depot, "nodes"), 
                     ggplot2::aes(col = "Depot"), size = 6) +
    ggplot2::labs(x = "", y = "", colour = "") +
    ggplot2::theme(text = element_text(size = 15))

plotly::ggplotly(p, tooltip = "") |> 
    plotly::layout(modebar = list(orientation = "v"))

Linguagem R testada com outros recursos da IBM Cloud

  • Kubernetes/Openshift
  • Code Engine
  • Cloud Pak for Data
  • App ID

Projetos pessoais com R

Obrigado

Contato

th1460.github.io
github.com/th1460
medium.com/@thopr
linkedin.com/in/thop

slack: @thop