Simple use of phpspider to collect the article content of this blog - article, Blog, Book, collection, Content, phpspider, simple, use

Collection process

Fetch the page content from the link (curl) -> extract the content that needs to be collected (it can be filtered with regular expressions, XPath, CSS selectors, etc.)

<div class="code">

<?php

/**
 * Minimal phpspider example: walk the first 10 list pages of a cnblogs blog,
 * collect the article URLs found on each page, download every article and
 * write its title and tag-stripped body to a text file named
 * "f<page index><article index>.txt" in the current working directory.
 */
require_once 'phpspider/autoloader.php';

use phpspider\core\phpspider;
use phpspider\core\requests;
use phpspider\core\selector;

/* Do NOT delete this comment */
/* Do not delete this comment*/

requests::$input_encoding = 'GB2312';
requests::$output_encoding = 'GB2312';

// The result of selector::select() is used below both as a single string and
// as a list, so every result is first normalised to a flat list of strings
// (nothing matched -> empty list, one string -> one-element list, nested
// arrays -> flattened). Non-string values such as null/false are dropped.
$toStrings = static function ($value) {
    $flat = [];
    $items = (array) $value;
    array_walk_recursive($items, static function ($item) use (&$flat) {
        if (is_string($item)) {
            $flat[] = $item;
        }
    });

    return $flat;
};

// Get the article URLs from the blog's article list pages.
// NOTE(review): this pattern is empty. The original expression (presumably the
// HTML of the article link with one capture group around its href) appears to
// have been lost when the page was extracted. Supply it before running; as
// written, no article URL is expected to be collected.
$listPattern = "//";

$articleUrlsByPage = [];
for ($page = 1; $page <= 10; $page++) {
    $listHtml = requests::get("https://www.cnblogs.com/jcydd/default.html?page=" . $page);
    $articleUrlsByPage[] = $toStrings(selector::select($listHtml, $listPattern, 'regex'));
}

// Loop over the collected URLs to get each article's title and content.
foreach ($articleUrlsByPage as $k => $articleUrls) {
    foreach ($articleUrls as $kk => $articleUrl) {
        $articleHtml = requests::get($articleUrl);
        if (empty($articleHtml)) {
            // Download failed or returned nothing: there is nothing to parse.
            continue;
        }

        // Article title. The leading and trailing "@" are the PCRE delimiters;
        // "@" is used instead of "/" so the "/" in "</a>" needs no escaping.
        // NOTE(review): the opening tag in front of "(.*)" appears to have
        // been stripped from the published snippet — restore it.
        $titlePattern = "@(.*)</a>@";
        $titles = $toStrings(selector::select($articleHtml, $titlePattern, 'regex'));
        // Only one title is expected; if several matched, keep the first.
        $title = isset($titles[0]) ? $titles[0] : '';

        // Article content.
        // NOTE(review): the enclosing tags appear to have been stripped from
        // this pattern as well. The inner group is non-capturing so that only
        // the body text is returned. "(?:.|\n)*" is costly on large pages and
        // may run into PCRE limits — possibly why some content was reported
        // as missing; confirm.
        $bodyPattern = "@((?:.|\n)*)@";
        $bodyParts = $toStrings(selector::select($articleHtml, $bodyPattern, 'regex'));

        // Remove the HTML tags from the article content. "[^>]*" stops at the
        // end of the tag itself, so text containing ">" is not swallowed.
        $bodyParts = (array) preg_replace('/<[^>]*>/', "", $bodyParts);

        // Write to file.
        $handle = fopen("f" . $k . $kk . ".txt", "w");
        if ($handle === false) {
            die("Unable to open file!");
        }

        fwrite($handle, $title);
        fwrite($handle, "\r\n ");
        foreach ($bodyParts as $part) {
            fwrite($handle, $part);
        }

        fclose($handle);
    }
}

<?php

/**
 * Minimal phpspider example: walk the first 10 list pages of a cnblogs blog,
 * collect the article URLs found on each page, download every article and
 * write its title and tag-stripped body to a text file named
 * "f<page index><article index>.txt" in the current working directory.
 */
require_once 'phpspider/autoloader.php';

use phpspider\core\phpspider;
use phpspider\core\requests;
use phpspider\core\selector;

/* Do NOT delete this comment */
/* Do not delete this comment*/

requests::$input_encoding = 'GB2312';
requests::$output_encoding = 'GB2312';

// The result of selector::select() is used below both as a single string and
// as a list, so every result is first normalised to a flat list of strings
// (nothing matched -> empty list, one string -> one-element list, nested
// arrays -> flattened). Non-string values such as null/false are dropped.
$toStrings = static function ($value) {
    $flat = [];
    $items = (array) $value;
    array_walk_recursive($items, static function ($item) use (&$flat) {
        if (is_string($item)) {
            $flat[] = $item;
        }
    });

    return $flat;
};

// Get the article URLs from the blog's article list pages.
// NOTE(review): this pattern is empty. The original expression (presumably the
// HTML of the article link with one capture group around its href) appears to
// have been lost when the page was extracted. Supply it before running; as
// written, no article URL is expected to be collected.
$listPattern = "//";

$articleUrlsByPage = [];
for ($page = 1; $page <= 10; $page++) {
    $listHtml = requests::get("https://www.cnblogs.com/jcydd/default.html?page=" . $page);
    $articleUrlsByPage[] = $toStrings(selector::select($listHtml, $listPattern, 'regex'));
}

// Loop over the collected URLs to get each article's title and content.
foreach ($articleUrlsByPage as $k => $articleUrls) {
    foreach ($articleUrls as $kk => $articleUrl) {
        $articleHtml = requests::get($articleUrl);
        if (empty($articleHtml)) {
            // Download failed or returned nothing: there is nothing to parse.
            continue;
        }

        // Article title. The leading and trailing "@" are the PCRE delimiters;
        // "@" is used instead of "/" so the "/" in "</a>" needs no escaping.
        // NOTE(review): the opening tag in front of "(.*)" appears to have
        // been stripped from the published snippet — restore it.
        $titlePattern = "@(.*)</a>@";
        $titles = $toStrings(selector::select($articleHtml, $titlePattern, 'regex'));
        // Only one title is expected; if several matched, keep the first.
        $title = isset($titles[0]) ? $titles[0] : '';

        // Article content.
        // NOTE(review): the enclosing tags appear to have been stripped from
        // this pattern as well. The inner group is non-capturing so that only
        // the body text is returned. "(?:.|\n)*" is costly on large pages and
        // may run into PCRE limits — possibly why some content was reported
        // as missing; confirm.
        $bodyPattern = "@((?:.|\n)*)@";
        $bodyParts = $toStrings(selector::select($articleHtml, $bodyPattern, 'regex'));

        // Remove the HTML tags from the article content. "[^>]*" stops at the
        // end of the tag itself, so text containing ">" is not swallowed.
        $bodyParts = (array) preg_replace('/<[^>]*>/', "", $bodyParts);

        // Write to file.
        $handle = fopen("f" . $k . $kk . ".txt", "w");
        if ($handle === false) {
            die("Unable to open file!");
        }

        fwrite($handle, $title);
        fwrite($handle, "\r\n ");
        foreach ($bodyParts as $part) {
            fwrite($handle, $part);
        }

        fclose($handle);
    }
}

Leave a Comment Cancel reply