# Skript scrapping.R slouzi k automatickemu stahovani dat z webovych stranek drugbank.ca o lecivech uzivanych pri lecbe schizofrenie. 
# Vysledkem jsou 3 tabulky v Excel - informace o stazenych lecivech, jejich interakce a cile.

#install.packages("rvest") - instalace balicku rvest pro stahovani dat z webu
#install.packages("DescTools") - instalace balicku DescTools pro prevod do Excel

# The former `rm(list = ls())` call was removed on purpose: the script assigns
# every variable it uses before reading it, so wiping the global environment
# gains nothing and silently destroys unrelated objects of whoever sources it.
library(rvest) # HTML download and CSS-selector scraping (read_html, html_nodes, ...)

# Download the accession numbers (AC) of all drugs returned by the DrugBank
# "schizophrenia" search. The ACs are used later to fetch each drug's own page.
#
# The search result is split over 4 pages. Page 1 uses the URL of the original
# search form; pages 2-4 share one URL pattern that differs only in `page=`.
odkaz_all_drugs <- c(
  paste0(
    "https://www.drugbank.ca/unearth/q?utf8=%E2%9C%93&query=schizophrenia",
    "&searcher=drugs&approved=1&vet_approved=1&nutraceutical=1&illicit=1",
    "&withdrawn=1&investigational=1&button="
  ),
  paste0(
    "https://www.drugbank.ca/unearth/q?approved=1&button=&c=_score&d=down",
    "&illicit=1&investigational=1&nutraceutical=1&page=", 2:4,
    "&query=schizophrenia&searcher=drugs&vet_approved=1&withdrawn=1"
  )
)

# Read the text of every ".btn-card" element on each page and concatenate the
# per-page results in page order. No page size is assumed, so a page with
# fewer (or more) hits than expected, or an empty last page, cannot misalign
# or recycle values the way fixed 25-element slots would.
DB <- unlist(lapply(odkaz_all_drugs, function(page_url) {
  page <- read_html(page_url)
  html_text(html_nodes(page, ".btn-card"))
}))

# State for the per-drug download.
odkaz <- "https://www.drugbank.ca/drugs/" # base URL; the AC is appended to it
Drugs <- matrix(0, length(DB), 3) # one row per kept drug: name, group, ATC
pomocna <- 1 # next free row of Drugs (number of kept drugs + 1)
# The per-drug tables are collected in lists and bound once after the loop
# instead of growing the result with rbind() on every iteration.
targets_list <- vector("list", length(DB))
interactions_list <- vector("list", length(DB))

# Download the detail page of each drug. seq_along() makes the loop a no-op
# when DB is empty (1:length(DB) would iterate over 1 and 0).
for (k in seq_along(DB)) {
  Drug <- read_html(paste0(odkaz, DB[k]))

  ATC <- Drug %>%
    html_node(css = "tr:nth-child(50) > td > ul > li:nth-child(1) > a") %>%
    html_text()
  Int <- html_node(Drug, css = "table#drug-interactions")
  Targ <- html_node(Drug, css = "table#moa-target-table")

  # Keep the drug only when the interactions table, the targets table and the
  # ATC entry are all present. A selector that matches nothing yields an
  # "xml_missing" object; testing the class gives a single TRUE/FALSE, which
  # is what `if` and `||` require.
  if (inherits(Int, "xml_missing") || inherits(Targ, "xml_missing") ||
        is.na(ATC)) {
    next
  }

  Name <- Drug %>% # drug name
    html_node("td, strong") %>%
    html_text()
  Group <- Drug %>% # group the drug belongs to
    html_node("tr:nth-child(5) td") %>%
    html_text()

  # Parse both tables and append the drug name as a new last column. The
  # column is added by position because the scraped tables may already
  # contain a column called "Drug".
  Targ <- html_table(Targ)
  Targ[, ncol(Targ) + 1] <- Name
  targets_list[[pomocna]] <- Targ

  Int <- html_table(Int)
  Int[, ncol(Int) + 1] <- Name
  interactions_list[[pomocna]] <- Int

  Drugs[pomocna, 1] <- Name
  Drugs[pomocna, 2] <- Group
  Drugs[pomocna, 3] <- ATC
  pomocna <- pomocna + 1 # the next kept drug goes to the next row
}

# Bind the collected tables. When no drug qualified, both results stay empty
# character vectors, as they were initialised before.
kept <- seq_len(pomocna - 1)
Targets <- character()
Interactions <- character()
if (length(kept) > 0) {
  Targets <- do.call(rbind, targets_list[kept])
  Interactions <- do.call(rbind, interactions_list[kept])
}

# Give the result tables readable column names.
names(Interactions) <- c("Interacting drug", "Interaction", "Group", "Drug")
# The drug name was appended as the last column of every targets table, so it
# is renamed by position rather than by an auto-generated name such as "V8",
# which is only correct for one particular column count.
names(Targets)[ncol(Targets)] <- "Drug"
# Drugs is a matrix: colnames() labels its columns, whereas names() would only
# attach element names that are lost on the subset below.
colnames(Drugs) <- c("Name", "Group", "ATC")

# Drop the unused rows. Drugs was allocated with one row per candidate drug
# because the number of drugs that have targets, interactions and an ATC code
# is not known in advance. seq_len() selects no rows when nothing was kept
# (1:0 would select row 1), and drop = FALSE keeps a single kept drug as a
# one-row matrix instead of collapsing it to a vector.
Drugs <- Drugs[seq_len(pomocna - 1), , drop = FALSE]

# Export the three result tables (drug info, interactions, targets) to Excel,
# one XLView() call per table.
library(DescTools) # load the package that provides XLView() for the Excel export
XLView(Drugs)
XLView(Interactions) 
XLView(Targets)