Xpath
在 XML 文档中查找信息的语言, 同样适用于 HTML
辅助工具
Xpath Helper
Chrome插件 快捷键 Ctrl + shift + x
XML Quire
xpath 编辑工具
Xpath 表达式
// 查找所有节点
//book
查找所有的book节点
/ 查找当前节点
//book/title
查找所有book节点下的title子节点
@ 获取属性筛选
//book//title[@lang="en"]
查找book节点下所有的title节点中,lang属性值为 "en"
[] 当前节点的限制
//bookstore/book[2]/title
查找bookstore下的第2个book节点下的title子节点
//title[@lang]
查找含有 lang 属性的 title节点
| 匹配多路径 (或匹配)
xpath表达式1 | xpath表达式2 | ...
contains() 匹配一个属性值中包含某些字符串的节点
//div[contains(@id,'qiushi_tag_')]
text() 匹配文本值
//book/title[text()="羊驼"]
查找 book 下的 title 下文本为"羊驼"节点
lxml
安装
pip install lxml
使用
导入模块
from lxml import etree
创建解析对象
parse_html = etree.HTML(html)
调用 xpath 匹配
r_list = parse_html.xpath('xpath 表达式')
ps: 返回结果以列表形式
示例
抓取指定贴吧所有图片
- 获取贴吧主页URL,下一页,找URL规律
- 获取1页中所有帖子URL地址
- 对每个帖子链接发请求,获取图片URL
- 对每个图片链接发请求,以wb方式写入本地
import requests
from lxml import etree


class BaiduSpider(object):
    """Scrape all images (and embedded videos) from a given Baidu Tieba forum.

    Flow: forum index page -> thread URLs -> media URLs -> save bytes locally.
    """

    def __init__(self):
        self.baseurl = 'http://tieba.baidu.com/f?'
        # Old IE User-Agent on purpose: Tieba serves a simpler, static HTML
        # variant to legacy browsers, which is easier to parse with xpath.
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}

    def get_turl(self, params):
        """Fetch one forum index page and process every thread link on it.

        params: query dict with keys 'kw' (forum name) and 'pn' (offset).
        """
        res = requests.get(
            self.baseurl,
            params=params,
            headers=self.headers
        )
        res.encoding = 'utf-8'
        html = res.text
        parse_html = etree.HTML(html)
        # NOTE: "cleafix" is a typo in Tieba's own markup — do not "fix" it.
        t_list = parse_html.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
        print(t_list)
        # t_list holds relative paths, e.g. ['/p/23232', '/p/923423']
        for t in t_list:
            url = 'http://tieba.baidu.com' + t
            self.get_imgurl(url)

    def get_imgurl(self, url):
        """Fetch one thread page, extract media URLs and save each one."""
        res = requests.get(url, headers=self.headers)
        res.encoding = 'utf-8'
        html = res.text
        parse_html = etree.HTML(html)
        # One query matches both post images (src) and embedded videos
        # (data-video) via the xpath union operator `|`.
        img_list = parse_html.xpath(
            '//div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src | //div[@class="video_src_wrapper"]/embed/@data-video')
        print(img_list)
        for img in img_list:
            self.write_img(img)

    def write_img(self, img):
        """Download one media URL and write the raw bytes to the working dir.

        Fix: the original set `res.encoding = 'utf-8'` and then used
        `res.content` — the encoding assignment only affects `res.text`,
        so it was dead code for a binary download and has been removed.
        """
        res = requests.get(img, headers=self.headers)
        data = res.content  # raw bytes; no text decoding applies
        # Last 10 chars of the URL as the file name (may collide — TODO confirm
        # whether uniqueness matters for this use case).
        filename = img[-10:]
        with open(filename, 'wb') as f:
            f.write(data)
        print('%s下载成功' % filename)

    def main(self):
        """Prompt for a forum name and page range, then crawl each page."""
        name = input('贴吧名:')
        begin = int(input('起始页:'))
        end = int(input('终止页:'))
        for page in range(begin, end + 1):
            pn = (page - 1) * 50  # Tieba lists 50 threads per index page
            params = {
                'kw': name,
                'pn': str(pn)
            }
            self.get_turl(params)


if __name__ == '__main__':
    spider = BaiduSpider()
    spider.main()
猫眼电影信息爬取
from urllib import request
import time
import csv
from lxml import etree


class MaoyanSpider(object):
    """Scrape film name, cast and release time from the Maoyan top-100 board."""

    def __init__(self):
        self.baseurl = 'https://maoyan.com/board/4?offset='
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
        # Counter of pages crawled, used only for progress output.
        self.page = 1

    def get_page(self, url):
        """Fetch one board page and hand the decoded HTML to the parser."""
        req = request.Request(url, headers=self.headers)
        res = request.urlopen(req)
        html = res.read().decode('utf-8')
        self.parse_page(html)

    def parse_page(self, html):
        """Extract [name, star, release time] from every <dd> movie node.

        Fix: the original bound a local variable named `time`, shadowing
        the imported `time` module within this method; renamed to
        `release_time`.
        """
        parse_html = etree.HTML(html)
        # Base xpath: one element node per movie entry on the board.
        dd_list = parse_html.xpath('//dl[@class="board-wrapper"]/dd')
        for dd in dd_list:
            name = dd.xpath('./a/@title')[0].strip()
            star = dd.xpath('.//p[@class="star"]/text()')[0].strip()
            release_time = dd.xpath('.//p[@class="releasetime"]/text()')[0].strip()
            print([name, star, release_time])

    def write_page(self, r_list):
        """Append rows to 猫眼.csv.

        r_list: iterable of 3-tuples (name, star, release_time).
        Fix: open with newline='' (required by the csv module to avoid
        blank rows on Windows) and an explicit utf-8 encoding so the
        Chinese content round-trips regardless of platform default.
        """
        with open('猫眼.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for rt in r_list:
                film = [
                    rt[0].strip(),
                    rt[1].strip(),
                    rt[2].strip()
                ]
                writer.writerow(film)

    def main(self):
        """Crawl the board at offsets 0,10,...,40 with a polite 1 s delay."""
        for offset in range(0, 41, 10):
            url = self.baseurl + str(offset)
            self.get_page(url)
            print('第%d页爬取成功' % self.page)
            self.page += 1
            time.sleep(1)  # throttle requests to avoid being blocked


if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.main()