Scrape em tabelas
Exercícios
Diretrizes gerais:
- Instale o pacote `tabulizer` (e sua dependência `tabulizerjars`) a partir do GitHub:
devtools::install_github(c("ropensci/tabulizerjars", "ropensci/tabulizer"))
## Skipping install of 'tabulizerjars' from a github remote, the SHA1 (d1924e01) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'tabulizer' from a github remote, the SHA1 (08e3d763) has not changed since last install.
## Use `force = TRUE` to force installation
- Baixe o arquivo .Rmd e o folder com os arquivos em .pdf e abra no RStudio.
Siga as diretrizes da atividade.
Rode o arquivo .Rmd por meio do ícone "Knit" (knitr) do RStudio.
Salve o .Rmd e submeta-o por meio do email renataoliveira@gmail.com.
library(hrbrthemes)
library(ggplot2)
library(Cairo)
library(extrafont)
library(rJava) # Needed for tabulizer
library(tabulizer) # Handy tool for PDF Scraping
library(tidyverse) # Core data manipulation and visualization libraries
library(janitor)
extrafont::loadfonts()
Este chunk faz a leitura dos arquivos .pdf da pasta `pdf/` e extrai as tabelas de cada um deles.
# PDF scrape tables ----
# Reads every PDF in `folder`, extracts the table(s) found on page 2 of each
# file and stacks them into `tmp`, one row per neighbourhood per file, with
# columns: bairro, sg, srag, obitos, data.

# Folder holding the PDFs to be loaded
folder <- "pdf/"

# List the PDFs once. `pattern` is a regular expression (not a glob), so
# anchor on the ".pdf" extension instead of using "*.pdf".
files <- data.frame(
  data = dir(folder, pattern = "\\.pdf$"),
  stringsAsFactors = FALSE
)

# Number of files in folder
num_files <- nrow(files)

# Template row that fixes the column set of the result even when no file is
# read; it is dropped again by the `!is.na(sg)` filter below.
tmp <- data.frame(bairro = NA, sg = NA, srag = NA, obitos = NA, data = NA)

# One slot per file; filled in the loop and bound together once at the end
# (binding inside the loop duplicated all previously accumulated rows).
daily_tables <- vector("list", num_files)

# seq_len() yields an empty sequence when the folder has no PDFs
for (i in seq_len(num_files)) {
  file_name <- files[i, 1]

  # Characters -14..-7 of the file name are used as the value of the `data`
  # column (presumably the report date -- confirm against the file naming).
  # Printed so progress is visible while the loop runs.
  data <- str_sub(file_name, -14, -7)
  print(data)

  scrape <- tabulizer::extract_tables(
    file = paste0(folder, file_name),
    output = "data.frame",
    pages = 2,
    guess = TRUE,
    method = "stream"
  )

  # First table: split the merged column into its two parts, drop the first
  # row, then keep columns 1, 3, 5 and 6 under standard names.
  scrape1 <- as.data.frame(scrape[[1]]) %>%
    separate(
      col = "Síndrome.Respiratória.Aguda.Grave",
      into = c("total de casos", "óbitos"),
      sep = " "
    ) %>%
    slice(-1)
  scrape1 <- scrape1[, c(1, 3, 5, 6)]
  names(scrape1) <- c("bairro", "sg", "srag", "obitos")

  if (length(scrape) != 1) {
    # Second table: its column names are appended back as a data row
    # (NOTE(review): this assumes the first data row was parsed as the
    # header -- confirm). Append after the last row rather than at a fixed
    # index so no existing row is overwritten.
    scrape2 <- as.data.frame(scrape[[2]])
    scrape2[nrow(scrape2) + 1, ] <- colnames(scrape2)
    names(scrape2) <- c("bairro", "sg", "srag", "obitos")

    # Drop the totals row and undo the name mangling of the recovered row
    # ("." in place of spaces, "X" prefixed to numbers).
    scrape2 <- scrape2 %>%
      filter(bairro != "TOTAL") %>%
      mutate(
        bairro = str_replace_all(bairro, "\\.", " "),
        sg = str_replace(sg, "X", ""),
        srag = str_replace(srag, "X", ""),
        obitos = str_replace(obitos, "X", "")
      )

    tmp_dia <- rbind(scrape1, scrape2)
  } else {
    tmp_dia <- scrape1
  }

  # Tag every row with the value taken from the file name
  tmp_dia[, 5] <- data
  names(tmp_dia) <- c("bairro", "sg", "srag", "obitos", "data")

  daily_tables[[i]] <- tmp_dia
}

# Stack all per-file tables once and drop rows without an `sg` value
# (this also removes the all-NA template row).
tmp <- bind_rows(tmp, daily_tables) %>%
  filter(!is.na(sg))