Python Concurrency in Practice: A Producer-Consumer Multithreaded Crawler Built on Queue

This article walks through a practical piece of Python concurrent programming: using a queue (queue.Queue) to build a producer-consumer multithreaded crawler and speed the program up. It crawls the article listing on cnblogs and saves the results to a file.
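The piece that makes this pattern safe is queue.Queue: its put() and get() are thread-safe and block when the queue is empty (or full, for a bounded queue), so threads can hand work to each other without explicit locks. A minimal standalone sketch of that handoff, separate from the crawler code below:

import queue
import threading

q = queue.Queue()

def producer():
    for i in range(3):
        q.put(i)       # thread-safe; would block only if the queue were bounded and full
    q.put(None)        # sentinel: tell the consumer there is no more work

def consumer():
    while True:
        item = q.get() # blocks until an item is available
        if item is None:
            break
        print("got", item)

threading.Thread(target=producer).start()
t = threading.Thread(target=consumer)
t.start()
t.join()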

cnblogs.py

import requests
from bs4 import BeautifulSoup

headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
# Note: "#p{page}" is a URL fragment. Fragments are resolved client-side and are
# never sent to the server, so each request below effectively fetches the same
# homepage HTML; the list is kept as-is to focus on the queue pattern.
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 50 + 1)
]

# Fetch the HTML of one page
def craw(url):
    r = requests.get(url, headers=headers, timeout=10)  # timeout so a stalled request can't hang a worker
    # print(url, len(r.text))
    return r.text

# Parse the post titles and URLs out of a listing page
def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all("a", class_="post-item-title")
    return [(link["href"], link.get_text()) for link in links]
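Before wiring up any threads, it is worth a quick smoke test of the module on its own. A minimal check (assuming the file is saved as cnblogs.py, matching the import in spider.py below):

if __name__ == "__main__":
    # Fetch one page and print the first few (url, title) tuples
    html = craw(urls[0])
    for href, title in parse(html)[:3]:
        print(href, title)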

spider.py

import cnblogs
import queue
import time
import random
import threading

# Producer: pull a URL from url_queue, download it, push the HTML into html_queue
def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    while True:
        # Blocks until a URL is available
        url = url_queue.get()
        # Download the page
        html = cnblogs.craw(url)
        # Hand the HTML over to the consumers
        html_queue.put(html)
        # Log the thread name, the URL just crawled, and how many URLs remain
        print(threading.current_thread().name, f"craw {url}", "url_queue.size=", url_queue.qsize())
        # Random delay so the site is not hammered
        time.sleep(random.randint(1, 2))


# Consumer: pull HTML from html_queue, parse it, write the results to the file
def do_parse(html_queue: queue.Queue, fout):
    while True:
        # Blocks until a page of HTML is available
        html = html_queue.get()
        # Extract the (url, title) tuples
        results = cnblogs.parse(html)
        # Write each result as one line
        for result in results:
            fout.write(str(result) + "\n")

        # Log the thread name, the result count, and the html_queue backlog
        print(threading.current_thread().name, "results.size", len(results), "html_queue.size=", html_queue.qsize())
        # Random delay
        time.sleep(random.randint(1, 2))


if __name__ == "__main__":
    # Queue of URLs waiting to be crawled
    url_queue = queue.Queue()
    # Queue of downloaded HTML waiting to be parsed
    html_queue = queue.Queue()
    # Output file for the parsed results
    fout = open("data.txt", "w", encoding="utf-8")

    # Seed the URL queue with everything we want to crawl
    for url in cnblogs.urls:
        url_queue.put(url)

    # Start three producer threads
    for idx in range(3):
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw{idx}")
        t.start()

    # Start two consumer threads
    for idx in range(2):
        t = threading.Thread(target=do_parse, args=(html_queue, fout), name=f"parse{idx}")
        t.start()

    # Note: both worker loops run forever, so this script never exits on its own
    # and fout is never closed cleanly; see the shutdown sketch below.
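As noted in the comments, the while True loops never terminate, so the script has to be killed by hand and data.txt is never properly closed. A common fix is to pass a sentinel value (None here) through the queues so each stage knows when to shut down. The sketch below is one way to do it, not part of the original code; do_craw2 and do_parse2 are hypothetical names for the shutdown-aware workers:

import queue
import threading
import cnblogs

def do_craw2(url_queue, html_queue):
    while True:
        url = url_queue.get()
        if url is None:             # sentinel: no more URLs
            url_queue.put(None)     # re-queue it so sibling producers stop too
            break
        html_queue.put(cnblogs.craw(url))

def do_parse2(html_queue, fout):
    while True:
        html = html_queue.get()
        if html is None:            # sentinel: no more HTML
            html_queue.put(None)
            break
        for result in cnblogs.parse(html):
            fout.write(str(result) + "\n")

if __name__ == "__main__":
    url_queue, html_queue = queue.Queue(), queue.Queue()
    for url in cnblogs.urls:
        url_queue.put(url)
    url_queue.put(None)             # one sentinel is enough: each producer re-queues it

    with open("data.txt", "w", encoding="utf-8") as fout:
        producers = [threading.Thread(target=do_craw2, args=(url_queue, html_queue)) for _ in range(3)]
        consumers = [threading.Thread(target=do_parse2, args=(html_queue, fout)) for _ in range(2)]
        for t in producers + consumers:
            t.start()
        for t in producers:
            t.join()
        html_queue.put(None)        # all producers are done: signal the consumers
        for t in consumers:
            t.join()

An alternative with the same effect is queue.Queue's task_done()/join() protocol combined with daemon worker threads.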

Sample run

Note in the log how the three producers steadily drain url_queue while the two consumers work through html_queue; once url_queue reaches 0 the producers fall silent and the parsers finish off the remaining backlog. The occasional repeated counts (for example two lines both reporting url_queue.size= 47) are harmless: qsize() is only an instantaneous snapshot, and other threads can act between the get() and the print.

(venv) fujie@fujuhaos-MacBook-Pro producer_consumer_spider % python spider.py
craw0 craw https://www.cnblogs.com/#p1 url_queue.size= 47
craw2 craw https://www.cnblogs.com/#p3 url_queue.size= 47
parse0 results.size 20 html_queue.size= 0
craw1 craw https://www.cnblogs.com/#p2 url_queue.size= 47
parse1 results.size 20 html_queue.size= 1
craw2 craw https://www.cnblogs.com/#p4 url_queue.size= 46
parse0 results.size 20 html_queue.size= 1
craw0 craw https://www.cnblogs.com/#p5 url_queue.size= 45
parse1 results.size 20 html_queue.size= 1
craw1 craw https://www.cnblogs.com/#p6 url_queue.size= 44
craw2 craw https://www.cnblogs.com/#p7 url_queue.size= 43
parse0 results.size 20 html_queue.size= 2
craw0 craw https://www.cnblogs.com/#p8 url_queue.size= 42
parse1 results.size 20 html_queue.size= 2
craw1 craw https://www.cnblogs.com/#p9 url_queue.size= 41
craw2 craw https://www.cnblogs.com/#p10 url_queue.size= 40
parse0 results.size 20 html_queue.size= 2
parse1 results.size 20 html_queue.size= 2
craw0 craw https://www.cnblogs.com/#p11 url_queue.size= 39
craw1 craw https://www.cnblogs.com/#p12 url_queue.size= 38
craw2 craw https://www.cnblogs.com/#p13 url_queue.size= 37
parse0 results.size 20 html_queue.size= 3
parse1 results.size 20 html_queue.size= 3
craw0 craw https://www.cnblogs.com/#p14 url_queue.size= 36
craw2 craw https://www.cnblogs.com/#p15 url_queue.size= 35
craw1 craw https://www.cnblogs.com/#p16 url_queue.size= 34
parse1 results.size 20 html_queue.size= 5
craw2 craw https://www.cnblogs.com/#p17 url_queue.size= 33
craw1 craw https://www.cnblogs.com/#p18 url_queue.size= 32
parse0 results.size 20 html_queue.size= 6
parse1 results.size 20 html_queue.size= 5
craw0 craw https://www.cnblogs.com/#p19 url_queue.size= 31
craw2 craw https://www.cnblogs.com/#p20 url_queue.size= 30
parse1 results.size 20 html_queue.size= 6
parse0 results.size 20 html_queue.size= 5
craw1 craw https://www.cnblogs.com/#p21 url_queue.size= 29
craw0 craw https://www.cnblogs.com/#p22 url_queue.size= 28
craw2 craw https://www.cnblogs.com/#p23 url_queue.size= 27
parse1 results.size 20 html_queue.size= 7
craw2 craw https://www.cnblogs.com/#p24 url_queue.size= 26
parse0 results.size 20 html_queue.size= 7
craw1 craw https://www.cnblogs.com/#p25 url_queue.size= 25
craw0 craw https://www.cnblogs.com/#p26 url_queue.size= 24
craw2 craw https://www.cnblogs.com/#p27 url_queue.size= 23
parse1 results.size 20 html_queue.size= 9
parse0 results.size 20 html_queue.size= 8
craw1 craw https://www.cnblogs.com/#p29 url_queue.size= 21
craw2 craw https://www.cnblogs.com/#p28 url_queue.size= 21
parse1 results.size 20 html_queue.size= 9
craw0 craw https://www.cnblogs.com/#p30 url_queue.size= 20
parse0 results.size 20 html_queue.size= 9
craw1 craw https://www.cnblogs.com/#p31 url_queue.size= 19
craw2 craw https://www.cnblogs.com/#p32 url_queue.size= 17
craw1 craw https://www.cnblogs.com/#p33 url_queue.size= 17
parse1 results.size 20 html_queue.size= 11
craw0 craw https://www.cnblogs.com/#p34 url_queue.size= 16
parse0 results.size 20 html_queue.size= 11
parse1 results.size 20 html_queue.size= 10
craw0 craw https://www.cnblogs.com/#p35 url_queue.size= 15
parse0 results.size 20 html_queue.size= 10
craw2 craw https://www.cnblogs.com/#p36 url_queue.size= 13
craw1 craw https://www.cnblogs.com/#p37 url_queue.size= 13
craw0 craw https://www.cnblogs.com/#p38 url_queue.size= 12
parse0 results.size 20 html_queue.size= 12
parse1 results.size 20 html_queue.size= 11
craw0 craw https://www.cnblogs.com/#p39 url_queue.size= 11
craw2 craw https://www.cnblogs.com/#p40 url_queue.size= 9
craw1 craw https://www.cnblogs.com/#p41 url_queue.size= 9
craw0 craw https://www.cnblogs.com/#p42 url_queue.size= 8
parse0 results.size 20 html_queue.size= 14
parse1 results.size 20 html_queue.size= 13
craw0 craw https://www.cnblogs.com/#p43 url_queue.size= 7
craw2 craw https://www.cnblogs.com/#p44 url_queue.size= 5
craw1 craw https://www.cnblogs.com/#p45 url_queue.size= 5
parse1 results.size 20 html_queue.size= 15
parse0 results.size 20 html_queue.size= 14
craw2 craw https://www.cnblogs.com/#p46 url_queue.size= 4
craw0 craw https://www.cnblogs.com/#p47 url_queue.size= 3
parse0 results.size 20 html_queue.size= 15
craw1 craw https://www.cnblogs.com/#p48 url_queue.size= 2
parse1 results.size 20 html_queue.size= 15
craw0 craw https://www.cnblogs.com/#p49 url_queue.size= 1
craw2 craw https://www.cnblogs.com/#p50 url_queue.size= 0
parse0 results.size 20 html_queue.size= 16
parse1 results.size 20 html_queue.size= 15
parse0 results.size 20 html_queue.size= 14
parse1 results.size 20 html_queue.size= 13
parse0 results.size 20 html_queue.size= 12
parse1 results.size 20 html_queue.size= 11
parse0 results.size 20 html_queue.size= 10
parse1 results.size 20 html_queue.size= 9
parse0 results.size 20 html_queue.size= 8
parse1 results.size 20 html_queue.size= 7
parse0 results.size 20 html_queue.size= 6
parse1 results.size 20 html_queue.size= 5
parse0 results.size 20 html_queue.size= 4
parse1 results.size 20 html_queue.size= 3
parse0 results.size 20 html_queue.size= 2
parse1 results.size 20 html_queue.size= 1
parse0 results.size 20 html_queue.size= 0