#!/usr/bin/env python  
# -*- coding: utf-8 -*- 
 
from urllib.parse import urljoin

import requests
from pyquery import PyQuery as pq
 
 
# Scrape every chapter linked from the novel's index page and append the
# chapter titles and bodies to a single UTF-8 text file.
url = 'http://www.136book.com/huaqiangu/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36'
                  ' (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}

# Fetch the index page. A timeout keeps the script from hanging forever and
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the chapter list.
index_response = requests.get(url, headers=headers, timeout=30)
index_response.raise_for_status()
r = index_response.text
doc = pq(r)
# Collect the chapter links. Per the original author's note, this selector is
# meant to skip the leading "latest chapters" section and start at chapter one
# (NOTE(review): depends on the site's markup -- confirm against the live page).
links = doc('div#book_detail:nth-child(2) li a').items()

# Open the output file once (append mode, as before) instead of reopening it
# for every chapter.
with open('花千骨.txt', 'a', encoding='utf8') as f:
    for link in links:
        href = link.attr('href')
        if not href:
            # An anchor without an href cannot be fetched; skip it.
            continue
        # Resolve relative hrefs against the index URL; absolute hrefs are
        # returned unchanged by urljoin.
        download_url = urljoin(url, href)
        # Request the chapter page.
        chapter_response = requests.get(download_url, headers=headers, timeout=30)
        chapter_response.raise_for_status()
        download_page = chapter_response.text
        # Extract the chapter text from the content container.
        contents = pq(download_page)('div#content').text()
        f.write(link.text() + "\n\n")
        f.write(contents + "\n\n")
print("写入文件完成!请查看")

1.安装方法

pip install pyquery

2.引用方法

from pyquery import PyQuery as pq

3.简介

 pyquery 是一个类似 jQuery、专供 Python 使用的 HTML 解析库,使用方法与 bs4 类似。

4.使用方法

  4.1 初始化方法:

from pyquery import PyQuery as pq
# Three ways to build a PyQuery document.
doc = pq(html)  # parse an HTML string
doc = pq(url="http://news.baidu.com/")  # fetch a web page and parse it
# A bare path string would be parsed as markup, so a local file must be
# passed through the filename keyword.
doc = pq(filename="./a.html")  # parse a local HTML file

      4.2 基本CSS选择器

from pyquery import PyQuery as pq
# Sample markup: a div#wrap containing a ul.s_from with three link elements.
html = ''' 
    <div id="wrap"> 
        <ul class="s_from"> 
            asdasd 
            <link href="http://asda.com">asdadasdad12312</link> 
            <link href="http://asda1.com">asdadasdad12312</link> 
            <link href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
''' 
doc = pq(html)
# Descendant selector: link elements inside .s_from inside #wrap.
# print() is called as a function so the example runs on Python 3 as well.
print(doc("#wrap .s_from link"))

  运行结果:

<link href="http://asda.com">asdadasdad12312</link> 
            <link href="http://asda1.com">asdadasdad12312</link> 
            <link href="http://asda2.com">asdadasdad12312</link>

  # 用于按 id 查找标签,. 用于按 class 查找标签,link 表示查找 link 标签;选择器之间的空格表示后代(里层)关系

  4.3 查找子元素

from pyquery import PyQuery as pq
# Sample markup: a div#wrap containing a ul.s_from with three link elements.
html = ''' 
    <div id="wrap"> 
        <ul class="s_from"> 
            asdasd 
            <link href="http://asda.com">asdadasdad12312</link> 
            <link href="http://asda1.com">asdadasdad12312</link> 
            <link href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
''' 
# Look up child elements of the #wrap container.
doc = pq(html)
wrapper = doc("#wrap")
print(wrapper)
print("类型为:%s" % type(wrapper))
# find() takes a selector and searches inside the current selection.
print(wrapper.find('.s_from'))
# children() returns the child elements of the current selection.
print(wrapper.children())

  运行结果:

<div id="wrap"> 
        <ul class="s_from"> 
            asdasd 
            <link href="http://asda.com">asdadasdad12312</link> 
            <link href="http://asda1.com">asdadasdad12312</link> 
            <link href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
类型为:<class 'pyquery.pyquery.PyQuery'> 
<ul class="s_from"> 
            asdasd 
            <link href="http://asda.com">asdadasdad12312</link> 
            <link href="http://asda1.com">asdadasdad12312</link> 
            <link href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
     
<ul class="s_from"> 
            asdasd 
            <link href="http://asda.com">asdadasdad12312</link> 
            <link href="http://asda1.com">asdadasdad12312</link> 
            <link href="http://asda2.com">asdadasdad12312</link> 
        </ul>

  根据运行结果可以发现返回结果类型为 PyQuery,并且 find 方法和 children 方法都可以获取里层标签;区别在于 find 会在所有后代节点中查找,而 children 只返回直接子节点

  4.4查找父元素

from pyquery import PyQuery as pq
# Sample markup: a div wrapping some text and a ul.s_from with three links.
html = ''' 
    <div href="wrap"> 
        hello nihao 
        <ul class="s_from"> 
            asdasd 
            <link href="http://asda.com">asdadasdad12312</link> 
            <link href="http://asda1.com">asdadasdad12312</link> 
            <link href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
''' 

doc = pq(html)
ul_node = doc(".s_from")
print(ul_node)
# Step up from the ul to its enclosing element.
print(ul_node.parent())

  运行结果:

<ul class="s_from"> 
            asdasd 
            <link href="http://asda.com">asdadasdad12312</link> 
            <link href="http://asda1.com">asdadasdad12312</link> 
            <link href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
     
<div href="wrap"> 
        hello nihao 
        <ul class="s_from"> 
            asdasd 
            <link href="http://asda.com">asdadasdad12312</link> 
            <link href="http://asda1.com">asdadasdad12312</link> 
            <link href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div>

  parent 可以查找出直接外层(父)标签及其包含的内容;与之类似的还有 parents,可以获取所有祖先节点

  4.5 查找兄弟元素

from pyquery import PyQuery as pq
# Sample markup: three link elements carrying different class attributes.
html = ''' 
    <div href="wrap"> 
        hello nihao 
        <ul class="s_from"> 
            asdasd 
            <link class='active1 a123' href="http://asda.com">asdadasdad12312</link> 
            <link class='active2' href="http://asda1.com">asdadasdad12312</link> 
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
''' 

doc = pq(html)
# Select the link that carries both the active1 and a123 classes.
first_link = doc("link.active1.a123")
print(first_link)
# siblings() yields the other elements at the same level.
print(first_link.siblings())

  运行结果:

<link class="active1 a123" href="http://asda.com">asdadasdad12312</link> 
             
<link class="active2" href="http://asda1.com">asdadasdad12312</link> 
            <link class="movie1" href="http://asda2.com">asdadasdad12312</link>

  根据运行结果可以看出,siblings 返回了同级的其他标签

  结论:子元素查找,父元素查找,兄弟元素查找,这些方法返回的结果类型都是pyquery类型,可以针对结果再次进行选择

  4.6 遍历查找结果

from pyquery import PyQuery as pq
# Sample markup: three link elements carrying different class attributes.
html = ''' 
    <div href="wrap"> 
        hello nihao 
        <ul class="s_from"> 
            asdasd 
            <link class='active1 a123' href="http://asda.com">asdadasdad12312</link> 
            <link class='active2' href="http://asda1.com">asdadasdad12312</link> 
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
''' 

doc = pq(html)
# items() lets us loop over the matched elements one at a time.
for node in doc("link").items():
    print(node)

  运行结果:

<link class="active1 a123" href="http://asda.com">asdadasdad12312</link> 
             
<link class="active2" href="http://asda1.com">asdadasdad12312</link> 
             
<link class="movie1" href="http://asda2.com">asdadasdad12312</link>

  4.7获取属性信息

from pyquery import PyQuery as pq
# Sample markup: three link elements, each with its own href.
html = ''' 
    <div href="wrap"> 
        hello nihao 
        <ul class="s_from"> 
            asdasd 
            <link class='active1 a123' href="http://asda.com">asdadasdad12312</link> 
            <link class='active2' href="http://asda1.com">asdadasdad12312</link> 
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
''' 

doc = pq(html)
# Read the href of every link twice: via the call form and the attribute form.
for node in doc("link").items():
    print(node.attr('href'))
    print(node.attr.href)

  运行结果:

http://asda.com 
http://asda.com 
http://asda1.com 
http://asda1.com 
http://asda2.com 
http://asda2.com

  4.8 获取文本

from pyquery import PyQuery as pq
# Sample markup: three link elements with identical text content.
html = ''' 
    <div href="wrap"> 
        hello nihao 
        <ul class="s_from"> 
            asdasd 
            <link class='active1 a123' href="http://asda.com">asdadasdad12312</link> 
            <link class='active2' href="http://asda1.com">asdadasdad12312</link> 
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
''' 

doc = pq(html)
# Print the text of each link element.
for node in doc("link").items():
    print(node.text())

  运行结果

asdadasdad12312 
asdadasdad12312 
asdadasdad12312

  4.9 获取 HTML信息

from pyquery import PyQuery as pq
# Sample markup: the first link wraps its text in an <a> element.
html = ''' 
    <div href="wrap"> 
        hello nihao 
        <ul class="s_from"> 
            asdasd 
            <link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link> 
            <link class='active2' href="http://asda1.com">asdadasdad12312</link> 
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
''' 

doc = pq(html)
# Print the inner markup of each link element.
for node in doc("link").items():
    print(node.html())

  运行结果:

<a>asdadasdad12312</a> 
asdadasdad12312 
asdadasdad12312

5.常用DOM操作

  5.1 addClass removeClass

  添加,移除class标签

from pyquery import PyQuery as pq
# Sample markup: the first link already carries the active1 class.
html = ''' 
    <div href="wrap"> 
        hello nihao 
        <ul class="s_from"> 
            asdasd 
            <link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link> 
            <link class='active2' href="http://asda1.com">asdadasdad12312</link> 
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
''' 

doc = pq(html)
for node in doc("link").items():
    # Add the class and show the element, then remove it and show it again.
    with_class = node.addClass('active1')
    print("添加:%s" % with_class)
    without_class = node.removeClass('active1')
    print("移除:%s" % without_class)

  运行结果

添加:<link class="active1 a123" href="http://asda.com"><a>asdadasdad12312</a></link> 
             
移除:<link class="a123" href="http://asda.com"><a>asdadasdad12312</a></link> 
             
添加:<link class="active2 active1" href="http://asda1.com">asdadasdad12312</link> 
             
移除:<link class="active2" href="http://asda1.com">asdadasdad12312</link> 
             
添加:<link class="movie1 active1" href="http://asda2.com">asdadasdad12312</link> 
         
移除:<link class="movie1" href="http://asda2.com">asdadasdad12312</link>

  需要注意的是,已经存在的 class 不会被重复添加

  5.2 attr css

  attr 用于获取/修改属性,css 用于添加 style 属性

from pyquery import PyQuery as pq
# Sample markup: three link elements carrying different class attributes.
html = ''' 
    <div href="wrap"> 
        hello nihao 
        <ul class="s_from"> 
            asdasd 
            <link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link> 
            <link class='active2' href="http://asda1.com">asdadasdad12312</link> 
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
''' 

doc = pq(html)
for node in doc("link").items():
    # Overwrite the class attribute, then set an inline style property.
    reclassed = node.attr('class', 'active')
    print("修改:%s" % reclassed)
    styled = node.css('font-size', '14px')
    print("添加:%s" % styled)

  运行结果

C:\Python27\python.exe D:/test_his/test_re_1.py 
修改:<link class="active" href="http://asda.com"><a>asdadasdad12312</a></link> 
             
添加:<link class="active" href="http://asda.com" style="font-size: 14px"><a>asdadasdad12312</a></link> 
             
修改:<link class="active" href="http://asda1.com">asdadasdad12312</link> 
             
添加:<link class="active" href="http://asda1.com" style="font-size: 14px">asdadasdad12312</link> 
             
修改:<link class="active" href="http://asda2.com">asdadasdad12312</link> 
         
添加:<link class="active" href="http://asda2.com" style="font-size: 14px">asdadasdad12312</link>

  attr 和 css 操作会直接修改对象本身(原地修改)

  5.3 remove

  remove 移除标签

from pyquery import PyQuery as pq
# Sample markup: a div holding some text plus a ul with three links.
html = ''' 
    <div href="wrap"> 
        hello nihao 
        <ul class="s_from"> 
            asdasd 
            <link class='active1 a123' href="http://asda.com"><a>asdadasdad12312</a></link> 
            <link class='active2' href="http://asda1.com">asdadasdad12312</link> 
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
''' 

doc = pq(html)
container = doc("div")
# Text of the div while the ul is still present.
print('移除前获取文本结果:\n%s' % container.text())
# Drop the ul from the div, then print the text again.
remaining = container.remove('ul')
print('移除后获取文本结果:\n%s' % remaining.text())

  运行结果

移除前获取文本结果: 
hello nihao 
asdasd 
asdadasdad12312 
asdadasdad12312 
asdadasdad12312 
移除后获取文本结果: 
hello nihao

  其他DOM方法参考:

  IT虾米网

6.伪类选择器

from pyquery import PyQuery as pq
# Sample markup: the first link's text starts with "hello".
html = ''' 
    <div href="wrap"> 
        hello nihao 
        <ul class="s_from"> 
            asdasd 
            <link class='active1 a123' href="http://asda.com"><a>helloasdadasdad12312</a></link> 
            <link class='active2' href="http://asda1.com">asdadasdad12312</link> 
            <link class='movie1' href="http://asda2.com">asdadasdad12312</link> 
        </ul> 
    </div> 
''' 

doc = pq(html)
# Each entry pairs an output template with the pseudo-class selector it
# demonstrates; they are printed in the listed order.
selector_demos = [
    ('第一个标签:%s', "link:first-child"),
    ('最后一个标签:%s', "link:last-child"),
    ('第二个标签:%s', "link:nth-child(2)"),
    ("获取0以后的标签:%s", "link:gt(0)"),  # gt() indexes from zero
    ("获取奇数标签:%s", "link:nth-child(2n-1)"),
    ("获取文本包含hello的标签:%s", "link:contains('hello')"),
]
for template, selector in selector_demos:
    print(template % doc(selector))

  运行结果

第一个标签:<link class="active1 a123" href="http://asda.com"><a>helloasdadasdad12312</a></link> 
             
最后一个标签:<link class="movie1" href="http://asda2.com">asdadasdad12312</link> 
         
第二个标签:<link class="active2" href="http://asda1.com">asdadasdad12312</link> 
             
获取0以后的标签:<link class="active2" href="http://asda1.com">asdadasdad12312</link> 
            <link class="movie1" href="http://asda2.com">asdadasdad12312</link> 
         
获取奇数标签:<link class="active1 a123" href="http://asda.com"><a>helloasdadasdad12312</a></link> 
            <link class="movie1" href="http://asda2.com">asdadasdad12312</link> 
         
获取文本包含hello的标签:<link class="active1 a123" href="http://asda.com"><a>helloasdadasdad12312</a></link>


评论关闭
IT序号网

微信公众号:IT虾米 (左侧二维码扫一扫)欢迎添加!

分享一个简单的爬虫