多进程爬虫_微信好师傅Python
近日因工作需求,要爬取微信公众号的一些信息,通过charles抓包之后发现一些请求参数已加密,奈何安卓的逆向咱又不懂,只能选笨办法遍历参数。
单进程速度感人,遂上多进程。以下为小脚本,希望抛砖引玉:
import requests
import re,os
import random
from multiprocessing import Pool
import json
# Crawl configuration shared by the parent and every worker process.
BASE_URL = "https://www.52hsfdj.com/haoshifu/userWorker/getUser/"
ID_PREFIX = "300"
ID_DIGITS = 6  # the numeric part of a user id is zero-padded to this width
QUERY_STRING = "?token=2aabb0bba934482b9377d486decad4c2&digest=204080a136f2ecee81f0b4624a2d0af81cd0f55508924803011ef36c62ae307e&platformSource=wxapp"
HEADERS = {
    "user-agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36"
}
OUT_PATH = 'i://WX_好师傅到家爬虫数据流_0315.txt'  # file every response is appended to
FIRST_ID = 1
LAST_ID_EXCLUSIVE = 99999  # same upper bound as the original range(1, 99999)
PROCESS_COUNT = 20
REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection blocks a worker forever


def _build_url(number):
    """Return the getUser URL for *number*, zero-padded to ID_DIGITS digits."""
    return BASE_URL + ID_PREFIX + str(number).zfill(ID_DIGITS) + QUERY_STRING


def _http_get(url):
    """Fetch *url* with the mobile user agent and return the response text."""
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    return response.text


def _split_range(start, stop, parts):
    """Split [start, stop) into at most *parts* contiguous, non-empty (lo, hi) chunks."""
    size, extra = divmod(max(stop - start, 0), parts)
    chunks = []
    lo = start
    for index in range(parts):
        # The first `extra` chunks take one additional id so the split is even.
        hi = lo + size + (1 if index < extra else 0)
        if hi > lo:
            chunks.append((lo, hi))
        lo = hi
    return chunks


def worker(start=FIRST_ID, stop=LAST_ID_EXCLUSIVE, out_path=OUT_PATH, fetch=None):
    """Crawl the user ids in ``range(start, stop)`` and append each response to a file.

    For every number the URL ``BASE_URL + "300" + <six-digit zero-padded number>
    + QUERY_STRING`` is requested and the response text is appended to
    *out_path* as one line. A failure on one id is reported and skipped so
    the rest of the range is still crawled.

    Args:
        start: First number to crawl (inclusive). Defaults to 1.
        stop: End of the range (exclusive). Defaults to 99999.
        out_path: File that receives the responses, opened in append mode.
        fetch: Callable taking a URL and returning the body text. Defaults
            to an HTTP GET through ``requests`` with a timeout.

    Returns:
        None.
    """
    if fetch is None:
        fetch = _http_get
    for number in range(start, stop):
        try:
            body = fetch(_build_url(number))
            # Re-open per record so each line reaches the file even if the
            # process is killed in the middle of a long crawl.
            with open(out_path, "a+", encoding='utf-8') as f:
                f.write(body + '\n')
        except Exception as exc:
            # Exception (not a bare except) keeps Ctrl-C / SystemExit able to
            # stop the crawl while still skipping over per-id failures.
            print('当前爬取遇到错误\n错误页数为:' + str(number) + '\n' + repr(exc))
            continue


if __name__ == "__main__":
    print("---start---")
    pool = Pool(PROCESS_COUNT)  # process pool with at most PROCESS_COUNT workers
    # apply_async queues the tasks without blocking, so the chunks really run
    # in parallel; each task gets its own slice of the id range, so no id is
    # fetched more than once.
    pending = [
        pool.apply_async(worker, chunk)
        for chunk in _split_range(FIRST_ID, LAST_ID_EXCLUSIVE, PROCESS_COUNT)
    ]
    pool.close()  # no further tasks will be submitted
    pool.join()  # wait for every worker process to finish
    for result in pending:
        result.get()  # surface any exception that escaped a worker
# 线程池