IT序号网

分享一个简单的爬虫

qq123 2021年05月25日 编程语言 299 0

实现对今日头条街拍近 400 条数据的爬取，感兴趣的朋友可以尝试一下。

import requests 
 
from urllib.parse import urlencode 
import os 
from hashlib import md5 
from multiprocessing.pool import Pool 
 
# Browser identity sent with every request so the server serves us the same
# content it would give a normal Chrome-on-Mac client.
user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/68.0.3440.106 Safari/537.36')
headers = {'User-Agent': user_agent}
 
def get_page(offset):
    """Fetch one page (20 items) of Toutiao street-snap search results.

    Args:
        offset: paging offset forwarded verbatim to the search API.

    Returns:
        The decoded JSON payload (a dict) on HTTP 200, otherwise None.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Without a timeout a stalled server would hang this pool worker forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
        # Non-200 previously fell through to an implicit None; make it explicit.
        return None
    except requests.RequestException:
        # Superset of the old ConnectionError: also covers timeouts and other
        # transport failures, which would otherwise crash the worker process.
        return None
 
def get_images(json):
    """Yield one {'image': url, 'title': title} dict per image in the API payload.

    Entries missing a title or an image_list are skipped, exactly as before.
    """
    entries = json.get('data')
    if not entries:
        return
    for entry in entries:
        caption = entry.get('title')
        pictures = entry.get('image_list')
        if not (caption and pictures):
            continue
        for picture in pictures:
            yield {
                'image': picture.get('url'),
                'title': caption
            }
 
 
def save_image(item):
    """Download one image and store it in a directory named after its title.

    The file name is the MD5 of the image bytes, so an identical image that
    was already downloaded is detected and skipped. Network failures are
    reported to stdout rather than raised, keeping the pool worker alive.
    """
    # makedirs(exist_ok=True) is safe under concurrent Pool workers; the old
    # exists()/mkdir() pair raced and could die with FileExistsError when two
    # workers handled the same title at once.
    os.makedirs(item.get('title'), exist_ok=True)
    try:
        url = 'http:' + item.get('image')
        # Timeout so one dead image host cannot hang the worker forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            file_path = '{0}/{1}.{2}'.format(
                item.get('title'), md5(response.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Download', file_path)
    except requests.RequestException:
        # Superset of the old ConnectionError; also catches timeouts.
        print('Failed to save Image')
 
def main(offset):
    """Worker entry point: fetch one result page and save every image on it."""
    payload = get_page(offset)
    if not payload:
        return
    for entry in get_images(payload):
        save_image(entry)
 
# Pages 1..20 map to API offsets 20..400 in steps of 20 (~400 items total).
GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    offsets = [page * 20 for page in range(GROUP_START, GROUP_END + 1)]
    # The context manager guarantees the pool is torn down even if map()
    # raises; the old close()/join() pair was skipped on any exception.
    with Pool() as pool:
        pool.map(main, offsets)


评论关闭
IT序号网

微信公众号号:IT虾米 (左侧二维码扫一扫)欢迎添加!