Collection process
Fetch the page content for each link (via curl) -> extract the content to be collected (it can be filtered with a regular expression, an XPath query, a CSS selector, etc.)
< div class="code">
<?php
require_once 'phpspider/autoloader.php';
use phpspider\core\phpspider;
use phpspider\core\requests;
use phpspider\core\selector;
/* Do NOT delete this comment */
/* Do not delete this comment*/

// Crawls a paginated blog list in two passes:
//   1. collect article URLs from list pages 1..10;
//   2. fetch every article and write its title and tag-stripped body to
//      "f<page index><article index>.txt" in the current directory.

requests::$input_encoding = 'GB2312';
requests::$output_encoding = 'GB2312';

// Flattens a selector result into a plain list of strings.
// NOTE(review): selector::select() presumably returns a string for a single
// match, a (possibly nested) array for several, and null/false for none --
// confirm against the phpspider source. Flattening here keeps the loops and
// fwrite() calls below from ever receiving a non-string / non-array value.
$toStrings = static function ($value): array {
    $flat = [];
    if (is_string($value)) {
        $flat[] = $value;
    } elseif (is_array($value)) {
        array_walk_recursive($value, static function ($item) use (&$flat) {
            if (is_string($item)) {
                $flat[] = $item;
            }
        });
    }
    return $flat;
};

// Pass 1: get the article URLs from the blog's article list pages.
// NOTE(review): this pattern is empty -- its HTML-bearing part appears to have
// been lost when the listing was published. It must be restored (presumably an
// anchor pattern with one capture group around the href) before any URL can
// be collected.
$selector = "//";
$result = [];
for ($i = 1; $i <= 10; $i++) {
    $url = "https://www.cnblogs.com/jcydd/default.html?page=" . $i;
    $html = requests::get($url);
    //var_dump($html);
    $result[] = $toStrings(selector::select($html, $selector, 'regex'));
}
//var_dump($result);

// Pass 2: loop over the collected URLs to get each article's title and content.
// "@" is used as the PCRE delimiter instead of "/" so that the "/" in closing
// tags such as "</a>" does not need escaping.
// NOTE(review): the opening-tag part of this pattern looks truncated as well;
// restore it so that only the title anchor is matched.
$selector1 = "@(.*)</a>@";
// The "s" modifier lets "." match newlines. It replaces a "(.|\n)*" loop, which
// adds a junk second capture group and is prone to hitting PCRE's
// backtrack/recursion limits on long pages -- a likely cause of the "some
// content cannot be obtained" problem reported by the original author.
// NOTE(review): the surrounding container tags of this pattern are missing too.
$selector2 = "@(.*)@s";

foreach ($result as $k => $articleUrls) {
    foreach ($articleUrls as $kk => $articleUrl) {
        $html1 = requests::get($articleUrl);
        //var_dump($html1);
        $titles = $toStrings(selector::select($html1, $selector1, 'regex'));
        //var_dump($titles);
        $bodyParts = $toStrings(selector::select($html1, $selector2, 'regex'));
        //var_dump($bodyParts);

        // Write to file: title on the first line, then the article text.
        $myfile = fopen("f" . $k . $kk . ".txt", "w") or die("Unable to open file!");
        fwrite($myfile, $titles[0] ?? '');
        fwrite($myfile, "\r\n ");
        foreach ($bodyParts as $part) {
            // Remove the HTML tags from the article content.
            fwrite($myfile, strip_tags($part));
        }
        fclose($myfile);
    }
}
<?php
require_once 'phpspider/autoloader.php';
use phpspider\core\phpspider;
use phpspider\core\requests;
use phpspider\core\selector;
/* Do NOT delete this comment */
/* Do not delete this comment*/

// Crawls a paginated blog list in two passes:
//   1. collect article URLs from list pages 1..10;
//   2. fetch every article and write its title and tag-stripped body to
//      "f<page index><article index>.txt" in the current directory.

requests::$input_encoding = 'GB2312';
requests::$output_encoding = 'GB2312';

// Flattens a selector result into a plain list of strings.
// NOTE(review): selector::select() presumably returns a string for a single
// match, a (possibly nested) array for several, and null/false for none --
// confirm against the phpspider source. Flattening here keeps the loops and
// fwrite() calls below from ever receiving a non-string / non-array value.
$toStrings = static function ($value): array {
    $flat = [];
    if (is_string($value)) {
        $flat[] = $value;
    } elseif (is_array($value)) {
        array_walk_recursive($value, static function ($item) use (&$flat) {
            if (is_string($item)) {
                $flat[] = $item;
            }
        });
    }
    return $flat;
};

// Pass 1: get the article URLs from the blog's article list pages.
// NOTE(review): this pattern is empty -- its HTML-bearing part appears to have
// been lost when the listing was published. It must be restored (presumably an
// anchor pattern with one capture group around the href) before any URL can
// be collected.
$selector = "//";
$result = [];
for ($i = 1; $i <= 10; $i++) {
    $url = "https://www.cnblogs.com/jcydd/default.html?page=" . $i;
    $html = requests::get($url);
    //var_dump($html);
    $result[] = $toStrings(selector::select($html, $selector, 'regex'));
}
//var_dump($result);

// Pass 2: loop over the collected URLs to get each article's title and content.
// "@" is used as the PCRE delimiter instead of "/" so that the "/" in closing
// tags such as "</a>" does not need escaping.
// NOTE(review): the opening-tag part of this pattern looks truncated as well;
// restore it so that only the title anchor is matched.
$selector1 = "@(.*)</a>@";
// The "s" modifier lets "." match newlines. It replaces a "(.|\n)*" loop, which
// adds a junk second capture group and is prone to hitting PCRE's
// backtrack/recursion limits on long pages -- a likely cause of the "some
// content cannot be obtained" problem reported by the original author.
// NOTE(review): the surrounding container tags of this pattern are missing too.
$selector2 = "@(.*)@s";

foreach ($result as $k => $articleUrls) {
    foreach ($articleUrls as $kk => $articleUrl) {
        $html1 = requests::get($articleUrl);
        //var_dump($html1);
        $titles = $toStrings(selector::select($html1, $selector1, 'regex'));
        //var_dump($titles);
        $bodyParts = $toStrings(selector::select($html1, $selector2, 'regex'));
        //var_dump($bodyParts);

        // Write to file: title on the first line, then the article text.
        $myfile = fopen("f" . $k . $kk . ".txt", "w") or die("Unable to open file!");
        fwrite($myfile, $titles[0] ?? '');
        fwrite($myfile, "\r\n ");
        foreach ($bodyParts as $part) {
            // Remove the HTML tags from the article content.
            fwrite($myfile, strip_tags($part));
        }
        fclose($myfile);
    }
}
WordPress database error: [Table 'yf99682.wp_s6mz6tyggq_comments' doesn't exist]
SELECT SQL_CALC_FOUND_ROWS wp_s6mz6tyggq_comments.comment_ID FROM wp_s6mz6tyggq_comments WHERE ( comment_approved = '1' ) AND comment_post_ID = 2063 ORDER BY wp_s6mz6tyggq_comments.comment_date_gmt ASC, wp_s6mz6tyggq_comments.comment_ID ASC