The following is a script to reproduce a problem I ran into while building a crawler with RCurl that performs concurrent requests.
The goal is to download the content of thousands of websites for statistical analysis, so the solution needs to scale.
library(RCurl)
library(httr)
uris = c("inforapido.com.ar", "lm.facebook.com", "promoswap.enterfactory.com",
         "p.brilig.com", "wap.renxo.com", "alamaula.com", "syndication.exoclick.com",
         "mcp-latam.zed.com", "startappexchange.com", "fonts.googleapis.com",
         "xnxx.com", "wv.inner-active.mobi", "canchallena.lanacion.com.ar",
         "android.ole.com.ar", "livefyre.com", "fbapp://256002347743983/thread")
### RCurl Concurrent requests
getURIs <- function(uris, ..., multiHandle = getCurlMultiHandle(), .perform = TRUE){
  content = list()
  curls = list()
  for(i in uris) {
    # one easy handle and one text gatherer per uri, pushed onto the multi handle
    curl = getCurlHandle()
    content[[i]] = basicTextGatherer()
    opts = curlOptions(URL = i, writefunction = content[[i]]$update,
                       timeout = 2, maxredirs = 3, verbose = TRUE,
                       followLocation = TRUE, ...)
    curlSetOpt(.opts = opts, curl = curl)
    multiHandle = push(multiHandle, curl)
  }
  if(.perform) {
    # run all requests concurrently and return the gathered text
    complete(multiHandle)
    lapply(content, function(x) x$value())
  } else {
    return(list(multiHandle = multiHandle, content = content))
  }
}
### Split uris into 3 groups
uris_ls = split(uris, 1:3)
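For the larger runs in the updates below (990 uris), fixed-size batches may be handier than the three interleaved groups split(uris, 1:3) produces; a base-R sketch, where the batch size of 50 and the name uris_batched are arbitrary choices of mine:
batch_size <- 50  # arbitrary; tune to how many uris you want per batch
uris_batched <- split(uris, ceiling(seq_along(uris) / batch_size))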
### retrieve content
uris_content <- list()
for(i in seq_along(uris_ls)){
  uris_content[[i]] <- getURIs(uris_ls[[i]])
}
library(plyr)
a = lapply(uris_content, function(x) ldply(x, rbind))
result = ldply(a, rbind)
names(result) <- c('url', 'content')
result$number_char <- nchar(as.character(result$content))
### Here are examples of urls that aren't working
url_not_working = result[result$number_char == 0, 1]
# url_not_working
# [1] "inforapido.com.ar"              "canchallena.lanacion.com.ar"    "fbapp://256002347743983/thread"
# [4] "xnxx.com"                       "startappexchange.com"           "wv.inner-active.mobi"
# [7] "livefyre.com"
### Using httr's GET() it works fine
get_httr = GET(url_not_working[2])
content(get_httr, 'text')
# The result is the same when using a single RCurl call
get_rcurl = getURL(url_not_working[2], encoding = 'UTF-8', timeout = 2,
                   maxredirs = 3, verbose = TRUE,
                   followLocation = TRUE)
get_rcurl
Question:
Given the number of pages I need to crawl, I would rather use RCurl because it supports concurrent requests.
I would like to know whether the getURIs() call can be improved so that it works like the GET() version in the cases where the getURL/getURIs version fails.
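One difference that might matter here: httr::GET sends a User-Agent header by default, while a bare RCurl handle does not, and some hosts reject requests without one. A quick, untested way to try this through the ... argument of getURIs(), using libcurl's standard useragent option (the agent string itself is arbitrary):
# useragent is forwarded through ... to curlOptions()
test <- getURIs(url_not_working, useragent = "Mozilla/5.0 (compatible; R)")
nchar(unlist(test))  # see whether any of the failing urls now return content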
Update:
I've added more data (990 uris) to reproduce the problem better.
uris_ls <- dput() # dput() output found here: https://gist.github.com/martinbel/b4cc730b32914475ef0b
After running:
uris_content <- list()
for(i in seq_along(uris_ls)){
  uris_content[[i]] <- getURIs(uris_ls[[i]])
}
I received the following error:
Error in curlMultiPerform(obj): embedded nul in string: 'GIF89a\001'
In addition: Warning message:
In strsplit(str, "\\\r\\\n"): input string 1 is invalid in this locale
Using getURIAsynchronous instead:
uris_content <- list()
for(i in seq_along(uris_ls)){
uris_content[[i]] <- getURIAsynchronous(uris_ls[[i]],
.opts=list(timeout = 2, maxredirs = 3, verbose = TRUE,
followLocation = TRUE))
}
I get a similar error:
nchar(str) error: invalid multibyte string 1
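The 'embedded nul ... GIF89a' message suggests that at least one response body is binary (a GIF image), which the text gatherers can't hold. A possible pre-filter, sketched with httr (already loaded above; the helper name is_texty is mine), is to check the Content-Type with a HEAD request and only fetch text-like responses:
is_texty <- function(u){
  # HEAD the url; on any error treat the content type as unknown
  ct <- tryCatch(headers(HEAD(u, timeout(2)))[["content-type"]],
                 error = function(e) NA_character_)
  !is.na(ct) && grepl("text|html|json|xml", ct)
}
# uris_text <- Filter(is_texty, uris_ls[[1]])  # keep only uris that look like text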
Update 2:
library(RCurl)
uris_ls <- dput() # dput() output found here: https://gist.github.com/martinbel/b4cc730b32914475ef0b
After trying the following:
Sys.setlocale(locale="C")
uris_content <- list()
for(i in seq_along(uris_ls)){
uris_content[[i]] <- getURIAsynchronous(uris_ls[[i]],
.opts=list(timeout = 2, maxredirs = 3, verbose = TRUE,
followLocation = TRUE))
}
The result is that it works for the first 225 URLs, and after that it only returns zero-length content from the websites. Is this a nul-error problem?
# This is a quick way to inspect the output:
nc = lapply(uris_content, nchar)
nc[[5]]
[1] 51422 0 16 19165 111763 6 14041 202 2485 0
[11] 78538 0 0 0 133253 42978 0 0 7880 33336
[21] 6762 194 93 0 0 0 0 0 9 0
[31] 165974 13222 22605 1392 0 42932 1421 0 0 0
[41] 0 13760 289 0 2674
nc[[6]]
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[39] 0 0 0 0 0 0 0
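A quick way to see how widespread this is, building on the nc list above:
sum(sapply(nc, function(x) all(x == 0)))  # batches that came back completely empty
mean(unlist(nc) == 0)                     # overall share of empty responses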
Since nobody has answered, I'll suggest a temporary solution of my own: if getURIAsynchronous doesn't work, fall back to httr::GET and httr::content and download the content sequentially.
library(RCurl)
library(httr)
Sys.setlocale(locale="C")
opts = list(timeout = 2, maxredirs = 3,
verbose = TRUE, followLocation = TRUE)
try_asynch <- function(uris, .opts = opts){
  getURIAsynchronous(uris, .opts = .opts)
}
get_content <- function(uris){
  cont <- try_asynch(uris)
  nc <- lapply(cont, nchar)
  nc <- sapply(nc, function(x) ifelse(sum(x > 0), 1, 0))
  # fall back to sequential httr::GET calls when almost nothing came back
  if(sum(nc) < 10){
    r <- lapply(uris, function(x) GET(x))
    cont <- lapply(r, function(x) content(x, 'text'))
  }
  cont
}
docs <- lapply(uris_ls, get_content)
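One caveat with this fallback: GET() itself throws an error for hosts that can't be resolved or that time out, which would abort the whole lapply. A small guard around it (safe_get is a name I made up) could look like this:
safe_get <- function(u){
  # return NA instead of stopping the whole batch when a single request fails
  tryCatch(content(GET(u, timeout(2)), 'text'),
           error = function(e) NA_character_)
}
# cont <- lapply(uris, safe_get)  # drop-in replacement for the GET/content pair above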