Goal
URL: https://zc-paimai.taobao.com/
Requirement: use a crawler to collect Taobao judicial-auction real-estate data, performing a full crawl of Beijing, Shanghai, and Guangzhou, with no fewer than 500 records.
Fields: title, price, location
Approach
Preliminary work
Locating the interface
Trimming the request
In total, four parameters need to be reverse-engineered.
Reversing the parameters
_m_h5_tk and _m_h5_tk_enc
Cookies originate in one of two ways, returned by the backend or generated by the frontend, so let's verify the first case.
Great: this shows both values are returned by the backend, which puts the reversing work halfway done already!
Note that even though this request returns no data, its cookies are still useful. And although the request carries a body and params that look encrypted, repeated testing shows that fixed parameter values return the needed cookie values stably over the long term.
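As a minimal sketch of that verification, assuming (as those tests suggest) the endpoint sets the token cookies even on a stripped-down request; the complete code below keeps the full captured params:

import requests

# Hit the mtop endpoint once and read whatever cookies the backend sets.
url = 'https://h5api.m.taobao.com/h5/mtop.taobao.datafront.invoke.auctionwalle/1.0/'
resp = requests.post(url, headers={'Referer': 'https://zc-paimai.taobao.com/'})
print(resp.cookies.get_dict())  # expect _m_h5_tk and _m_h5_tk_enc among these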
sign
Locating the encryption
The first search yields nothing, so change tactics and search for sign:
The breakpoint hits, and the value looks very much like a 32-character MD5 digest.
Testing confirms it is standard MD5. Next, let's trace where the pieces of o.token + "&" + a + "&" + s + "&" + n.data come from.
o.token
Search for o=.
There are 160-odd matches, too many to pinpoint, so inspect the current call stack instead, which leads to o=this.options. Searching for this.options finds this.options = r(n || {}, f); execute that function.
No token member shows up, so token must be added afterwards. For a JavaScript object, a member is usually added either as obj.member or obj["member"]; let's try the first form.
Note that this.options may later be assigned to another variable, i.e., given a new alias, so search for .token= to cover that case as well. Four results come up, of which a(S) and a(h) both look like plausible generators.
The result shows that o.token comes from a(S), where S is _m_h5_tk.
a() extracts a substring from document.cookie, so the cookie key being read is almost certainly _m_h5_tk; let's have the currently fashionable GPT explain the function.
Its answer confirms the guess. But we can't celebrate just yet: there is one more processing step, taking the part before the underscore as the final return value.
We established earlier that this cookie is returned by an interface, so o.token is now fully solved.
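In Python the same extraction is a one-liner; cookies here stands for the dict returned by the cookie-priming request above:

# _m_h5_tk looks like "<token>_<timestamp>"; keep the part before the first underscore
token = cookies['_m_h5_tk'].split('_', 1)[0]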
a
It turns out a is simply a timestamp.
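In Python, an equivalent millisecond timestamp (the same expression the complete code below uses) is:

import time
cur_timestamp = str(round(time.time() * 1000))  # milliseconds since the epoch, as a string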
s
As the summary below shows, s is the appKey, a fixed value (12574478 in these captures).
n.data
And n.data is simply the data field sent with the request.
To summarize how sign is derived: take the part of the _m_h5_tk cookie value before the underscore, concatenate "&" + the current timestamp + "&" + the appKey (a fixed value) + "&" + the data value sent with the request, and MD5-hash the resulting string. We can now replicate the algorithm directly in Python.
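A minimal sketch of that algorithm; make_sign is a name introduced here for illustration, and token/data stand for the values derived above:

import hashlib
import time

def make_sign(token, app_key, data):
    """Reproduce the page's sign: md5(token & timestamp & appKey & data)."""
    t = str(round(time.time() * 1000))
    raw = token + "&" + t + "&" + app_key + "&" + data
    return t, hashlib.md5(raw.encode('utf-8')).hexdigest()

# usage: token is the part of _m_h5_tk before '_'; app_key is the fixed 12574478
# t, sign = make_sign(cookies['_m_h5_tk'].split('_', 1)[0], '12574478', payload['data'])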
t
The t query parameter is just the same millisecond timestamp that feeds the sign, as the complete code below shows.
Pagination logic
Flipping pages only changes the page field inside the request's data payload.
Region-switching logic
Switching regions changes the locationCodes entry and the sid value in the payload; one (locationCode, sid) pair per city is captured once and stored in location_change in the code below.
Assembling and testing the code
This confirms that the assumptions behind the parameter reversing, the pagination, and the region switching were all correct. The complete code follows.
Complete code
import json
import time
import hashlib

import pymongo
import requests
from loguru import logger
from redis.client import Redis
from jmespath import search


class Spider:
    def __init__(self):
        self.mongodb = pymongo.MongoClient()
        self.db = self.mongodb['spider']
        self.coll = self.db['court_auction']
        self.redis_cli = Redis()
        # one (locationCode, sid) pair per city, captured from the region-switch requests
        self.location_change = {
            'beiJing': {'locationCode': '110000',
                        'sid': '0272474869_1684412604030'},
            'shangHai': {'locationCode': '310000',
                         'sid': '1373475548_1684412676879'},
            'guangZhou': {'locationCode': '440000',
                          'sid': '8608688863_1684412736958'}
        }

    def md5_enc(self, text):
        md5 = hashlib.md5()
        md5.update(text.encode('utf-8'))
        return md5.hexdigest()

    def get_cookie(self):
        """Prime _m_h5_tk/_m_h5_tk_enc; fixed t and sign values are accepted by this request."""
        headers = {
            'Content-type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
            'Referer': 'https://zc-paimai.taobao.com/',
        }
        params = (
            ('jsv', '2.6.1'),
            ('appKey', '12574478'),
            ('t', '1684378620773'),
            ('sign', '796d54577ac8db0af55dab2f0a06555d'),
            ('bxPageId', '1910955'),
            ('api', 'mtop.taobao.datafront.invoke.auctionwalle'),
            ('v', '1.0'),
            ('type', 'originaljson'),
            ('dataType', 'json'),
            ('requiredParams', 'dfApiName,dfUniqueId'),
        )
        data = {
            'data': '{"dfApp":"auctionwalle","dfApiName":"auctionwalle.datou.getPageModulesData","dfVariables":"{\\"pageId\\":1910955,\\"moduleIds\\":\\"4394607430,4489817680,4480874310,2004318340,4529967930,2708524060,global\\",\\"context\\":{\\"pmid\\":\\"7287198777_1684376275117\\",\\"pmtk\\":\\"20140647.0.0.0.27064540.puimod-pc-search-navbar_5143927030.vault-jump\\",\\"spm\\":\\"a2129.27064540.puimod-pc-search-navbar_5143927030.vault-jump\\",\\"path\\":\\"27064540\\",\\"keyword\\":\\"\u4E8C\u624B\u623F\\",\\"page\\":\\"2\\",\\"userInfo\\":\\"{}\\",\\"sceneCode\\":\\"20210823QCG72BUD\\",\\"firstScreen\\":\\"true\\",\\"device\\":\\"pc\\"}}","dfUniqueId":"1910955.4394607430,4489817680,4480874310,2004318340,4529967930,2708524060,global","dfVariablesRecover":"{}"}'
        }
        url = 'https://h5api.m.taobao.com/h5/mtop.taobao.datafront.invoke.auctionwalle/1.0/'
        logger.debug('url: ' + url)
        response = requests.post(url, headers=headers, params=params, data=data)
        logger.debug('cookie: ' + json.dumps(response.cookies.get_dict(), ensure_ascii=False, indent=2))
        return response.cookies.get_dict()

    def get_data(self, cookie, page, location):
        cur_timeStamp = str(round(time.time() * 1000))
        headers = {
            "content-type": "application/x-www-form-urlencoded",
            "referer": "https://zc-paimai.taobao.com/",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
        }
        url = "https://h5api.m.taobao.com/h5/mtop.taobao.datafront.invoke.auctionwalle/1.0/"
        logger.debug('url: ' + url)
        # page, locationCode and sid are substituted into the captured payload template
        data = {
            "data": "{\"dfApp\":\"auctionwalle\",\"dfApiName\":\"auctionwalle.datou.getPageModulesData\",\"dfVariables\":\"{\\\"pageId\\\":1910955,\\\"moduleIds\\\":\\\"2004318340:items\\\",\\\"context\\\":{\\\"_b_2004318340:items\\\":\\\"{\\\\\\\"keyword\\\\\\\":\\\\\\\"二手房\\\\\\\",\\\\\\\"pmid\\\\\\\":\\\\\\\"3840625134_1684412570290\\\\\\\",\\\\\\\"pmtk\\\\\\\":\\\\\\\"20140647.0.0.0.27064540.puimod-pc-search-navbar_5143927030.vault-jump\\\\\\\",\\\\\\\"spm\\\\\\\":\\\\\\\"a2129.27064540.puimod-pc-search-navbar_5143927030.vault-jump\\\\\\\",\\\\\\\"path\\\\\\\":\\\\\\\"27064540\\\\\\\",\\\\\\\"locationCodes\\\\\\\":[\\\\\\\"%(locationCode)s\\\\\\\"],\\\\\\\"userInfo\\\\\\\":{},\\\\\\\"appendMap\\\\\\\":{\\\\\\\"sid\\\\\\\":\\\\\\\"%(sid)s\\\\\\\"},\\\\\\\"page\\\\\\\":\\\\\\\"%(page)s\\\\\\\"}\\\",\\\"userInfo\\\":\\\"{}\\\",\\\"device\\\":\\\"pc\\\",\\\"sceneCode\\\":\\\"20210823QCG72BUD\\\"}}\",\"dfUniqueId\":\"1910955.2004318340:items\",\"dfVariablesRecover\":\"{}\"}" % {
                'page': page, 'locationCode': location['locationCode'], 'sid': location['sid']}
        }
        params = {
            "jsv": "2.6.1",
            "appKey": "12574478",
            "t": cur_timeStamp,
            # sign = md5(token + "&" + timestamp + "&" + appKey + "&" + data)
            "sign": self.md5_enc(
                cookie['_m_h5_tk'].split('_', 1)[0] + "&" + cur_timeStamp + "&" + "12574478" + "&" + data['data']),
            "bxPageId": "1910955",
            "api": "mtop.taobao.datafront.invoke.auctionwalle",
            "v": "1.0",
            "type": "originaljson",
            "dataType": "json",
            "requiredParams": "dfApiName,dfUniqueId"
        }
        response = requests.post(url, headers=headers, cookies=cookie, params=params, data=data)
        return response

    def parse(self, res, city):
        data2save = []
        items = search('data.data.GQL_getPageModulesData."2004318340".items.schemeList', res.json())
        for item in items:
            title = search('auctionTitle', item)
            price = str(search('price', item)) + search('priceUnit', item)
            location = search('auctionBenefits', item)
            data2save.append({
                'city': city,
                'title': title,
                'price': price,
                'location': location
            })
        logger.debug('data: ' + json.dumps(data2save, indent=2, ensure_ascii=False))
        return data2save

    def save(self, data):
        """Deduplicate with a Redis set: sadd returns 1 when the member is new, 0 when it already exists."""
        for i in data:
            if self.redis_cli.sadd('court_auction', json.dumps(i)):
                self.coll.insert_one(i)
                logger.debug('Saved!')

    def main(self):
        cookie = self.get_cookie()  # this cookie stays valid for quite a while, no need to refresh it per request
        for k, v in self.location_change.items():
            for i in range(1, 10):
                res = self.get_data(cookie, i, v)
                data2save = self.parse(res, k)
                self.save(data2save)
                time.sleep(1)


if __name__ == '__main__':
    spider = Spider()
    spider.main()
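A single run walks 3 cities × 9 pages each (the range(1, 10) loop); whether that clears the 500-record target depends on the interface's page size, so raise the loop's upper bound if the count falls short.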
Please credit the source when reposting. You are welcome to verify the sources cited in this article and to point out anything erroneous or unclear; for questions, email 2454612285@qq.com.