多进程爬虫_微信好师傅Python

近日因工作需求,要爬取微信公众号的一些信息,通过charles抓包之后发现一些请求参数已加密,奈何安卓的逆向咱又不懂,只能选笨办法遍历参数。

单进程速度感人,遂上多进程(multiprocessing 进程池)。以下为小脚本,希望抛砖引玉:

import requests
import re,os
import random
from multiprocessing import Pool
import json
def worker(start=1, stop=99999):
    """Download the getUser response for a range of numeric ID suffixes.

    For every integer ``n`` in ``range(start, stop)`` the suffix is
    zero-padded to six digits, prefixed with ``"300"`` and requested from the
    ``getUser`` endpoint.  The raw response text is appended as one line to
    the output file.  A failure for a single ID is reported on stdout and the
    loop moves on to the next ID.

    Args:
        start: First suffix to request (inclusive).  Defaults to 1.
        stop: End of the suffix range (exclusive).  Defaults to 99999, so
            calling ``worker()`` scans the same range as before.

    Returns:
        None.
    """
    base_url = "https://www.52hsfdj.com/haoshifu/userWorker/getUser/"
    query = "?token=2aabb0bba934482b9377d486decad4c2&digest=204080a136f2ecee81f0b4624a2d0af81cd0f55508924803011ef36c62ae307e&platformSource=wxapp"
    headers = {
        "user-agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36"
    }
    out_path = 'i://WX_好师傅到家爬虫数据流_0315.txt'  # output file for the raw responses

    for number_six_random in range(start, stop):
        # zfill pads short numbers to six digits and leaves longer ones
        # untouched, so a single code path serves every suffix length.
        url = base_url + "300" + str(number_six_random).zfill(6) + query
        try:
            # Without a timeout a stalled connection would block this
            # process forever.
            response_data = requests.get(url=url, headers=headers, timeout=10)
            # NOTE(review): several processes append to this one file; very
            # large responses could interleave between processes -- confirm
            # this is acceptable for the collected data.
            with open(out_path, "a+", encoding='utf-8') as f:
                f.write(response_data.text + '\n')
        except Exception as err:
            # Best effort: report the failing ID and keep going.  Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            print('当前爬取遇到错误\n错误页数为:' + str(number_six_random) + ' (' + repr(err) + ')')
            continue


if __name__ == "__main__":
    process_count = 20
    first_id, last_id = 1, 99999  # half-open range, same as worker()'s defaults
    # Ceiling division so that process_count chunks cover the whole range.
    chunk = -(-(last_id - first_id) // process_count)

    print("---start---")
    po = Pool(process_count)  # process pool with at most process_count workers
    # apply_async queues each task without waiting for it, so the chunks run
    # in parallel; each task gets its own disjoint slice of the ID range.
    pending = [
        po.apply_async(worker, (lower, min(lower + chunk, last_id)))
        for lower in range(first_id, last_id, chunk)
    ]
    po.close()  # no further tasks will be submitted
    po.join()  # wait for every worker process to finish
    for task in pending:
        task.get()  # re-raise anything a task did not handle itself

添加评论

您的电子邮箱地址不会被公开。 必填项已用*标注