A simple example of using phpspider to collect the articles of this blog

Collection process

Fetch the page content for each link (via curl) -> extract the content that needs to be collected (it can be filtered with regular expressions, XPath, CSS selectors, etc.)

< div class="code">

<?php

/*
 * Collects the articles of one cnblogs blog with phpspider.
 *
 * Step 1: walk list pages 1..10 and extract the article URLs from each page.
 * Step 2: fetch every article, extract its title and body, strip the HTML
 *         tags from the body and write both to "f<page index><link index>.txt".
 *
 * The script stops with "Unable to open file!" when an output file cannot be
 * created; pages or articles that cannot be fetched are skipped.
 */

require_once 'phpspider/autoloader.php';
use phpspider\core\phpspider;
use phpspider\core\requests;
use phpspider\core\selector;
/* Do NOT delete this comment */
/* Do not delete this comment*/
requests::$input_encoding = 'GB2312';
requests::$output_encoding = 'GB2312';

// Turns whatever selector::select() hands back into a flat list of strings.
// NOTE(review): the result is handled defensively because it may be a single
// scalar, a (possibly nested) array, or nothing at all depending on how many
// matches/groups the pattern produced - confirm against the phpspider version
// in use.
$toStrings = function ($value) {
    $flat = [];
    if ($value === null || $value === false) {
        return $flat;
    }
    $items = is_array($value) ? $value : [$value];
    array_walk_recursive($items, function ($item) use (&$flat) {
        if (is_scalar($item)) {
            $flat[] = (string) $item;
        }
    });
    return $flat;
};

//Get the article url of the blog article list
$result = [];
for ($i = 1; $i <= 10; $i++) {
    $url = "https://www.cnblogs.com/jcydd/default.html?page=" . $i;
    $html = requests::get($url);
    //var_dump($html);
    if (!is_string($html) || $html === '') {
        // Keep an empty slot so the page index used in the file names stays stable.
        $result[] = [];
        continue;
    }
    // NOTE(review): this pattern is empty - the HTML markup it matched appears
    // to have been stripped when the snippet was published. Restore a pattern
    // with one capture group around the article URL before running the script.
    $selector = "//";
    $result[] = $toStrings(selector::select($html, $selector, 'regex'));
}
//var_dump($result);

//According to the url loop to get the article title and content
foreach ($result as $k => $v) {
    foreach ($v as $kk => $vv) {
        $html1 = requests::get($vv);
        //var_dump($html1);
        if (!is_string($html1) || $html1 === '') {
            // Article could not be fetched; nothing to write for it.
            continue;
        }

        // Article title. PCRE patterns must be wrapped in delimiters; "@" is
        // used instead of "/" so the slash of the closing tag needs no escaping.
        // NOTE(review): the opening-tag part of this pattern appears to be
        // missing (stripped together with the page markup) - restore it.
        $selector1 = "@(.*)</a>@";
        $titles = $toStrings(selector::select($html1, $selector1, 'regex'));
        //var_dump($titles);
        // Only the first match is the title; fwrite() needs a string, not an array.
        $result1 = isset($titles[0]) ? $titles[0] : '';

        // Article content.
        // NOTE(review): the tags that delimit the article body appear to be
        // missing from this pattern as well; as written it is not anchored to
        // any element - restore the surrounding markup.
        $selector2 = "@((.|\n)*)@";
        $result2 = $toStrings(selector::select($html1, $selector2, 'regex'));
        //var_dump($result2);

        //Remove the html tags in the content of the article
        // "[^>]*" stops at the first ">" so each tag is removed on its own,
        // whether or not it is followed by a space.
        $result2 = preg_replace('/<[^>]*>/', "", $result2);
        if (!is_array($result2)) {
            // preg_replace() returns null on a PCRE error.
            $result2 = [];
        }

        //Write to file
        $myfile = fopen("f" . $k . $kk . ".txt", "w");
        if ($myfile === false) {
            die("Unable to open file!");
        }
        fwrite($myfile, $result1);
        fwrite($myfile, "\r\n ");
        foreach ($result2 as $vvv) {
            fwrite($myfile, $vvv);
        }

        fclose($myfile);
    }
}


<?php

/*
 * Collects the articles of one cnblogs blog with phpspider.
 *
 * Step 1: walk list pages 1..10 and extract the article URLs from each page.
 * Step 2: fetch every article, extract its title and body, strip the HTML
 *         tags from the body and write both to "f<page index><link index>.txt".
 *
 * The script stops with "Unable to open file!" when an output file cannot be
 * created; pages or articles that cannot be fetched are skipped.
 */

require_once 'phpspider/autoloader.php';
use phpspider\core\phpspider;
use phpspider\core\requests;
use phpspider\core\selector;
/* Do NOT delete this comment */
/* Do not delete this comment*/
requests::$input_encoding = 'GB2312';
requests::$output_encoding = 'GB2312';

// Turns whatever selector::select() hands back into a flat list of strings.
// NOTE(review): the result is handled defensively because it may be a single
// scalar, a (possibly nested) array, or nothing at all depending on how many
// matches/groups the pattern produced - confirm against the phpspider version
// in use.
$toStrings = function ($value) {
    $flat = [];
    if ($value === null || $value === false) {
        return $flat;
    }
    $items = is_array($value) ? $value : [$value];
    array_walk_recursive($items, function ($item) use (&$flat) {
        if (is_scalar($item)) {
            $flat[] = (string) $item;
        }
    });
    return $flat;
};

//Get the article url of the blog article list
$result = [];
for ($i = 1; $i <= 10; $i++) {
    $url = "https://www.cnblogs.com/jcydd/default.html?page=" . $i;
    $html = requests::get($url);
    //var_dump($html);
    if (!is_string($html) || $html === '') {
        // Keep an empty slot so the page index used in the file names stays stable.
        $result[] = [];
        continue;
    }
    // NOTE(review): this pattern is empty - the HTML markup it matched appears
    // to have been stripped when the snippet was published. Restore a pattern
    // with one capture group around the article URL before running the script.
    $selector = "//";
    $result[] = $toStrings(selector::select($html, $selector, 'regex'));
}
//var_dump($result);

//According to the url loop to get the article title and content
foreach ($result as $k => $v) {
    foreach ($v as $kk => $vv) {
        $html1 = requests::get($vv);
        //var_dump($html1);
        if (!is_string($html1) || $html1 === '') {
            // Article could not be fetched; nothing to write for it.
            continue;
        }

        // Article title. PCRE patterns must be wrapped in delimiters; "@" is
        // used instead of "/" so the slash of the closing tag needs no escaping.
        // NOTE(review): the opening-tag part of this pattern appears to be
        // missing (stripped together with the page markup) - restore it.
        $selector1 = "@(.*)</a>@";
        $titles = $toStrings(selector::select($html1, $selector1, 'regex'));
        //var_dump($titles);
        // Only the first match is the title; fwrite() needs a string, not an array.
        $result1 = isset($titles[0]) ? $titles[0] : '';

        // Article content.
        // NOTE(review): the tags that delimit the article body appear to be
        // missing from this pattern as well; as written it is not anchored to
        // any element - restore the surrounding markup.
        $selector2 = "@((.|\n)*)@";
        $result2 = $toStrings(selector::select($html1, $selector2, 'regex'));
        //var_dump($result2);

        //Remove the html tags in the content of the article
        // "[^>]*" stops at the first ">" so each tag is removed on its own,
        // whether or not it is followed by a space.
        $result2 = preg_replace('/<[^>]*>/', "", $result2);
        if (!is_array($result2)) {
            // preg_replace() returns null on a PCRE error.
            $result2 = [];
        }

        //Write to file
        $myfile = fopen("f" . $k . $kk . ".txt", "w");
        if ($myfile === false) {
            die("Unable to open file!");
        }
        fwrite($myfile, $result1);
        fwrite($myfile, "\r\n ");
        foreach ($result2 as $vvv) {
            fwrite($myfile, $vvv);
        }

        fclose($myfile);
    }
}


Leave a Comment

Your email address will not be published.