Graduate web crawler (three) - First, Fluid, web crawler, three

A first code snippet:

import requests
# Download the Wikipedia article and save its HTML to a local file so that it
# can be parsed later without hitting the network again.
url = "https://en.wikipedia.org/wiki/Steve_Jobs"
# requests has no default timeout; without one a stalled connection would
# block the script forever.
res = requests.get(url, timeout=30)
print(res.status_code)
# Do not save an error page (4xx/5xx) as if it were the article.
res.raise_for_status()
# The encoding is given explicitly so the file is written as UTF-8 regardless
# of the platform's default text encoding.
with open("a.html", "w", encoding="utf-8") as f:
    f.write(res.text)

This saves the web page to a file. Because the default text encoding Python uses on Windows is not UTF-8, you must specify encoding='utf-8' when opening the file.

One more code snippet:

import requests
import re
from lxml import etree
# Parse the locally saved page: print every "header: value" row of the
# biography infobox, then the text of the article's headings and paragraphs.
with open("a.html", "r", encoding="utf-8") as f:
    c = f.read()
tree = etree.HTML(c)
# The whole infobox table; not used below, kept for inspection while debugging.
table_element = tree.xpath("//table[@class='infobox biography vcard']")
table_row = tree.xpath("//table[@class='infobox biography vcard'][1]/tbody/tr")
# Non-greedy match of a single tag, used to strip markup from serialized cells.
pattern_attrib = re.compile(r"<.*?>")
for row in table_row:
    headers = row.xpath("th")
    cells = row.xpath("td")
    # Some rows (e.g. image or section rows) lack a <th> or a <td>; skip them
    # explicitly instead of relying on an IndexError being caught.
    if not headers or not cells:
        continue
    title = etree.tostring(headers[0]).decode("utf-8")
    title = pattern_attrib.sub(" ", title)
    desc = etree.tostring(cells[0]).decode("utf-8")
    desc = pattern_attrib.sub(" ", desc)
    print(title + ":" + desc)
    print("=========")
content = tree.xpath("//div[@id='mw-content-text'][1]//*[self::h2 or self::p]")
for line in content:
    # .text only holds the text before the first child element (and is None
    # when the element starts with a child tag), so gather all nested text.
    text = "".join(line.itertext()).strip()
    if text:
        print(text)

Leave a Comment Cancel reply