一、目标
- 地址:https://credit.hefei.gov.cn/credit-website/publicity/search-result/xyxx-list.do
- 需求:抓取该页面下站点数据2000个 采集字段:标题、信用代码、工商注册码
二、思路
通过F12抓包分析,找到目标接口:
https://credit.hefei.gov.cn/credit-website/publicity/topSearch/getXYXXPageInfo.do
这是一个POST请求,请求参数有:currentPageNo(当前页码)、pageSize(每页条数)、creditQuery(查询条件,下文代码中传空字符串)。
通过python模拟请求后发现,必须要携带cookie才能够请求成功,排查后发现主要是这两个cookie
__jsl_clearance_s
__jsluid_s
cookie加密一般有2种来源:
- 后端接口返回
- 前端document.cookie生成
首先尝试第一种:打开proxyman(抓包工具),浏览器清掉网站缓存,
在proxyman中的filter添加过滤条件,搜索
__jsluid_s
和__jsl_clearance_s
__jsluid_s成功搜索到,但
__jsl_clearance_s
未搜索到<br/>
但是发现有个可疑之处,怀疑这便是
__jsl_clearance_s
的来源<br>
在控制台执行这段脚本,验证了我的猜想,这也给我提供了一种思路,也就是在response_body搜索
document.cookie
关键词来查找cookie设置位置,document不属于变量,无法被混淆!<br/>
紧接着带入这两个cookie,放入测试请求中验证,结果仍然是不对的,返回状态码是521,这时候意识到,可能这个cookie在中间还产生过变化
仔细对比了cookie后发现,
__jsl_clearance_s
是不一致的<br>
那么什么时候发生的呢? 仔细看一下抓包请求
<br>
打开一个空白页面执行下该段js,果然如此!
<br>
接下来准备扣出加密算法,
先尝试打断点调试,断不住,为什么,因为这是动态返回的代码,可以考虑替换,在替换文件中首行写下debugger;<br>
<br>
然后使用proxyman提供的替换功能<br>
<br>
成功断住<br>
js代码经过了混淆处理,但不影响寻找目标,在脚本内搜索document(必然会调用document.cookie),发现了一处可疑的地方
<br>
把这句代码复制出来,稍微认真一下
<br>
function foo() { document[_0x49d1('0x40', '4Kd0') + 'ie'] = _0xa88405[_0x49d1('0xa0', 'oT!N') + 'z'](_0xa88405[_0x49d1('0x5b', 'UOyd') + 'J'](_0xa88405[_0x49d1('0x63', 'x*XD') + 'Y'](_0xa88405[_0x49d1('0x93', '!3a3') + 'Y'](_0x4af155['tn'], '='), _0x38ba7c[0x0]), _0xa88405[_0x49d1('0x53', 'e@Cg') + 's']) + _0x4af155['vt'], _0x49d1('0x35', 'x*XD') + _0x49d1('0x3', ']#Kf') + '\x20/'); } /* _0x49d1('0x40', '4Kd0') + 'ie' 控制台执行结果为'cookie' _0x49d1('0xa0', 'oT!N') + 'z' 控制台执行结果为'lvLxz' ... 这里没什么技巧,混淆就是这样,考验的耐心 */ //最后发现了核心代码
let __jsl_clearance_s=_0x38ba7c[0] //容易发现_0x38ba7c这是一个数组,搜索寻找其定位的地方
var _0x38ba7c = _0x213920(_0x4af155['ct'], _0x4af155['bts']); //方法里面传了2个参数,先参数后方法,控制台还原参数
<br>
源代码搜索_0x4af155,没有找到其赋值的地方,估计中途换了马甲,上栈台<br>
<br>
<br>
好嘛,原来是原住名,先暂时固定住<br>
// Parameter object captured from the call stack while paused in the debugger.
// It is pinned to this one sample so the rest of the analysis works on fixed values.
var _0x4af155 = { "bts": ["1684211908.462|0|864", "4kitzYBW4tT9ZMt6O%2BYZMo%3D"], "chars": "hCLxUnaKoldwolXHfOYoAC", "ct": "9039b0cf78bae628fe5e482dd5048e35d34fa598", "ha": "sha1", "tn": "__jsl_clearance_s", "vt": "3600", "wt": "1500" }
接下来来看看这个方法做了什么
逆向加密方法逻辑
<br>
/*
 * Step into _0x213920 in the debugger; the body is short, so it is copied
 * here verbatim. It is kept as a reference comment because it only runs
 * inside the site's script (it depends on _0x49d1, _0xa88405 and _0x2d647a):
 *
 * function _0x213920(_0x4166ab, _0x319b42) {
 *     var _0x31fe7f = _0x4af155[_0x49d1('0x4c', '6yLo') + 's'][_0x49d1('0x95', '!dKZ') + 'th'];
 *     for (var _0x282885 = 0x0; _0x282885 < _0x31fe7f; _0x282885++) {
 *         for (var _0x494272 = 0x0; _0xa88405[_0x49d1('0x2e', 'x*XD') + 'T'](_0x494272, _0x31fe7f); _0x494272++) {
 *             var _0x587cf2 = _0xa88405[_0x49d1('0x4e', '4SZE') + 'I'](_0x319b42[0x0] + _0x4af155[_0x49d1('0x7e', '#HAO') + 's'][_0x49d1('0x15', 'DN7@') + 'tr'](_0x282885, 0x1) + _0x4af155[_0x49d1('0x58', 'UOyd') + 's'][_0x49d1('0x51', 'P(7z') + 'tr'](_0x494272, 0x1), _0x319b42[0x1]);
 *             if (_0xa88405[_0x49d1('0x5c', 'DjgR') + 'W'](hash(_0x587cf2), _0x4166ab)) {
 *                 return [_0x587cf2, new Date() - _0x2d647a];
 *             }
 *         }
 *     }
 * }
 */

/**
 * De-obfuscated version of the function above, restored by evaluating each
 * string fragment in the console (another test of patience).
 *
 * Unlike the raw version, which returns `[value, elapsedTime]`, this
 * simplified form returns the matching string directly.
 *
 * @param {string} _0x4166ab - Expected digest (the 'ct' property of _0x4af155).
 * @param {string[]} _0x319b42 - Prefix and suffix (the 'bts' property of _0x4af155).
 * @returns {string|undefined} The first candidate whose hash equals
 *   `_0x4166ab`, or `undefined` when no pair of characters matches.
 */
function _0x213920(_0x4166ab, _0x319b42) {
    var _0x31fe7f = _0x4af155['chars']['length'];
    for (var _0x282885 = 0; _0x282885 < _0x31fe7f; _0x282885++) {
        for (var _0x494272 = 0; _0x494272 < _0x31fe7f; _0x494272++) {
            var _0x587cf2 = (_0x319b42[0] + _0x4af155['chars']['substr'](_0x282885, 1) + _0x4af155['chars']['substr'](_0x494272, 1)) + _0x319b42[1];
            if (hash(_0x587cf2) === _0x4166ab) {
                return _0x587cf2;
            }
        }
    }
}

/*
 * Everything except `hash` has been restored. The logic in brief:
 *
 * - _0x4166ab is the 'ct' property of _0x4af155; in the sample above it is
 *   "9039b0cf78bae628fe5e482dd5048e35d34fa598".
 * - _0x319b42 is the 'bts' property of _0x4af155; in the sample above it is
 *   ["1684211908.462|0|864", "4kitzYBW4tT9ZMt6O%2BYZMo%3D"].
 * - The first line stores the length of the 'chars' property in _0x31fe7f.
 * - Two nested for loops follow; both start at 0, step by 1 and run while
 *   the index is smaller than _0x31fe7f.
 * - Inside the loops:
 *   1. The first element of bts, the two characters of 'chars' selected by
 *      the two loop indices, and the second element of bts are concatenated
 *      and stored in _0x587cf2.
 *   2. If hash(_0x587cf2) equals the first argument (the 'ct' value),
 *      _0x587cf2 is returned; it is the final cookie value. `hash` is most
 *      likely a standard digest such as MD5 or one of the SHA family.
 *
 * Which digest is it? Assuming it is a stock algorithm (in my experience
 * custom modifications are unlikely), the "ha": "sha1" entry in _0x4af155 is
 * the obvious hint, so SHA-1 is assumed first. With the logic understood,
 * the next step is to verify it locally in Node.js.
 */
验证加密逻辑能否跑通
代码如下:
// Local Node.js script that checks the recovered algorithm against the
// sample captured from the debugger. Requires the third-party `crypto-js`
// package.
let crypto = require('crypto-js')

/**
 * Hash `data` with the algorithm named by `method`.
 *
 * @param {string} method - Digest name taken from the challenge ('ha' field).
 * @param {string} data - Text to hash.
 * @returns {string} Digest string produced by crypto-js `toString()`.
 * @throws {Error} When `method` is not a recorded algorithm.
 */
function hash(method, data) {
    switch (method) {
        case 'sha1':
            return crypto.SHA1(data).toString();
        default:
            // Throw a real Error (not a bare string) so a stack trace is kept.
            throw new Error("未记录的method: " + method);
    }
}

/**
 * Try every ordered pair of characters from the global `_0x4af155.chars`,
 * inserted between the two `bts` fragments, until the hash matches.
 *
 * @param {string} _0x4166ab - Expected digest ('ct').
 * @param {string[]} _0x319b42 - Prefix and suffix fragments ('bts').
 * @returns {string|undefined} The matching cookie value, or `undefined`.
 */
function _0x213920(_0x4166ab, _0x319b42) {
    var _0x31fe7f = _0x4af155['chars']['length'];
    for (var _0x282885 = 0; _0x282885 < _0x31fe7f; _0x282885++) {
        for (var _0x494272 = 0; _0x494272 < _0x31fe7f; _0x494272++) {
            var _0x587cf2 = (_0x319b42[0] + _0x4af155['chars']['substr'](_0x282885, 1) + _0x4af155['chars']['substr'](_0x494272, 1)) + _0x319b42[1];
            if (hash(_0x4af155['ha'], _0x587cf2) === _0x4166ab) {
                return _0x587cf2;
            }
        }
    }
}

/**
 * Entry point: compute the `__jsl_clearance_s` value for one challenge.
 *
 * @param {object} _0x4af155 - Challenge parameters (ct, bts, chars, ha, ...).
 * @returns {string|undefined} The cookie value, or `undefined` if no match.
 */
function encrypt_cookie(_0x4af155) {
    // _0x213920 also reads this object, so it is published on the global object.
    global._0x4af155 = _0x4af155;
    return _0x213920(_0x4af155['ct'], _0x4af155['bts'])
}

// The sample is deliberately NOT named `_0x4af155`: a top-level `let` with
// that name would shadow the global set by encrypt_cookie, so _0x213920 would
// always read the sample instead of the object passed in.
const sampleParams = { "bts": ["1684211908.462|0|864", "4kitzYBW4tT9ZMt6O%2BYZMo%3D"], "chars": "hCLxUnaKoldwolXHfOYoAC", "ct": "9039b0cf78bae628fe5e482dd5048e35d34fa598", "ha": "sha1", "tn": "__jsl_clearance_s", "vt": "3600", "wt": "1500" };
console.log(encrypt_cookie(sampleParams));

这证明假设正确。用python完整地模拟一次请求后发现有时成功有时不成功,估计这是因为动态html返回的参数中
"ha": "sha1"
并不是每次都是sha1。经过多次尝试,发现主要是这三种:md5、sha1、sha256,
那么只需要简单地做个动态逻辑判断就好了。至此,整个逆向过程完成,再次梳理下思路:
- 一共3次请求,2次521,第三次才是200,cookie加密2处:__jsluid_s和__jsl_clearance_s(加速乐的显著特征)
- __jsluid_s来源于第一次response的set-cookie
- __jsl_clearance_s来源于第二次response.body中的js计算,而构造第二次请求所需cookie来源于第一次请求中response.set-cookie,和response.body中的js脚本(简单混淆)
- 其中__jsl_clearance_s经过了2次变换,第二次变换的主要特点是for循环和hash(随机)加密
- 一共3次请求,2次521,第三次才是200,cookie加密2处:
三、代码
python
import json
import time
import requests
import execjs
from jmespath import search
from loguru import logger
import pymongo
from redis import Redis
class Spider():
    """Scraper for the Hefei credit-publicity list behind a cookie challenge.

    The flow is three requests: ``req1`` obtains ``__jsluid_s`` and a first
    ``__jsl_clearance_s``; ``req2`` sends those and computes the final
    ``__jsl_clearance_s`` with the JavaScript helper; ``req3`` calls the data
    API with the final cookies. Parsed rows are de-duplicated through a Redis
    set and stored in MongoDB.
    """

    LIST_URL = "https://credit.hefei.gov.cn/credit-website/publicity/search-result/xyxx-list.do"
    API_URL = 'https://credit.hefei.gov.cn/credit-website/publicity/topSearch/getXYXXPageInfo.do'
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"

    def __init__(self):
        """Create the MongoDB/Redis clients and load the JavaScript helper source."""
        self.mongodb = pymongo.MongoClient()
        self.db = self.mongodb['spider']
        self.coll = self.db['jsl']
        self.redis_cli = Redis()
        # Explicit encoding so reading does not depend on the platform default.
        # NOTE(review): assumes emu_enc.js is saved as UTF-8 - confirm.
        with open('emu_enc.js', encoding='utf-8') as f:
            self.js = f.read()

    def req1(self):
        """First request: collect ``__jsluid_s`` and the first-stage clearance cookie.

        Returns:
            dict with the keys ``__jsl_clearance_s`` and ``__jsluid_s``.

        Raises:
            KeyError: if the response did not set ``__jsluid_s``.
        """
        headers = {
            "User-Agent": self.USER_AGENT,
        }
        response = requests.get(self.LIST_URL, headers=headers)
        # The page body is a script that assigns document.cookie from a
        # character-by-character expression; keep only the value expression.
        prefix = '''<script>document.cookie=('_')+('_')+('j')+('s')+('l')+('_')+('c')+('l')+('e')+('a')+('r')+('a')+('n')+('c')+('e')+('_')+('s')+('=')'''
        suffix = '''+(';')+('m')+('a')+('x')'''
        clearance_expr = response.text.split(prefix, 1)[-1].rsplit(suffix, 1)[0]
        # SECURITY: this evaluates JavaScript taken from the server response.
        clearance = execjs.eval(clearance_expr)
        return {
            '__jsl_clearance_s': clearance,
            '__jsluid_s': response.cookies.get_dict()['__jsluid_s']
        }

    def req2(self, cookie):
        """Second request: compute the final ``__jsl_clearance_s``.

        Args:
            cookie: cookie dict returned by ``req1``. It is updated in place.

        Returns:
            The same dict, with ``__jsl_clearance_s`` replaced.
        """
        headers = {
            "User-Agent": self.USER_AGENT,
        }
        response = requests.post(self.LIST_URL, headers=headers, cookies=cookie)
        # The challenge parameters are the JSON argument of the trailing go(...) call.
        data = response.text.rsplit('go(', 1)[-1].split(')</script>', 1)[0]
        clearance = execjs.compile(self.js).call('encrypt_cookie', json.loads(data))
        cookie.update({'__jsl_clearance_s': clearance})  # this cookie expires after roughly 1h
        return cookie

    def req3(self, cookie, page):
        """Third request: fetch one page of results from the data API.

        Args:
            cookie: cookie dict returned by ``req2``.
            page: page number to request.

        Returns:
            The ``requests`` response object.

        Raises:
            AssertionError: if the status code is not 200.
        """
        headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                   'User-Agent': self.USER_AGENT,
                   'Referer': self.LIST_URL,
                   }
        logger.debug('req:' + self.API_URL)
        data = {
            'currentPageNo': page,  # current page number
            'pageSize': '100',  # the API accepts a page size different from the one the web page uses
            'creditQuery': ''
        }
        response = requests.post(self.API_URL, headers=headers, cookies=cookie, data=data)
        assert response.status_code == 200
        return response

    def parse(self, response):
        """Extract title, credit code and registration code from an API response.

        Args:
            response: object whose ``json()`` returns the decoded payload.

        Returns:
            list of dicts with the keys ``title``, ``credential_code`` and
            ``register_code``; empty when the payload has no ``rows``.
        """
        data = response.json()
        # A payload without 'rows' yields None; treat it as "no rows".
        items = search('rows', data) or []
        data2save = [
            {
                "title": search('baseDwmc', item),
                "credential_code": search('uniscid', item),
                "register_code": search('baseZch', item)
            }
            for item in items
        ]
        logger.debug('data2save: ' + json.dumps(data2save, ensure_ascii=False, indent=2))
        return data2save

    def save(self, data):
        """Insert records into MongoDB, skipping ones already seen.

        A Redis set is the de-duplication filter: ``sadd`` returns 1 when the
        member was newly added and 0 when it was already in the set, so only
        new records are inserted.
        """
        for i in data:
            if self.redis_cli.sadd('jsl', json.dumps(i)):
                self.coll.insert_one(i)
                logger.debug('保存成功!')

    def main(self, pages=20):
        """Run the whole crawl.

        Args:
            pages: number of pages to fetch, starting at page 1. With 100 rows
                per page the default of 20 covers the 2000-row target.
        """
        cookie = self.req1()
        cookie2 = self.req2(cookie)
        for i in range(1, pages + 1):
            res = self.req3(cookie2, i)
            data2save = self.parse(res)
            self.save(data2save)
            time.sleep(3)  # polite crawling: pause between pages
if __name__ == '__main__':
    # Script entry point: build the spider and run the crawl.
    Spider().main()
js
let crypto = require('crypto-js')
/**
 * Hash `data` with the algorithm named by `method`.
 *
 * The challenge does not always use the same digest, so the name is taken
 * from its `ha` field and dispatched here.
 *
 * @param {string} method - One of 'sha1', 'md5' or 'sha256'.
 * @param {string} data - Text to hash.
 * @returns {string} Digest string produced by crypto-js `toString()`.
 * @throws {Error} When `method` is not one of the recorded algorithms.
 */
function hash(method, data) {
    switch (method) {
        case 'sha1':
            return crypto.SHA1(data).toString();
        case 'md5':
            return crypto.MD5(data).toString();
        case 'sha256':
            return crypto.SHA256(data).toString();
        default:
            // Throw a real Error (not a bare string) so a stack trace is kept.
            throw new Error("未记录的method: " + method);
    }
}
/**
 * Search for the cookie value by brute force.
 *
 * Every ordered pair of characters from the global `_0x4af155.chars` is
 * placed between the two `bts` fragments; the first candidate whose hash
 * (algorithm named by the global `_0x4af155.ha`) equals the expected digest
 * is returned.
 *
 * @param {string} _0x4166ab - Expected digest ('ct').
 * @param {string[]} _0x319b42 - Prefix and suffix fragments ('bts').
 * @returns {string|undefined} The matching candidate, or `undefined` when no
 *   pair of characters produces the expected digest.
 */
function _0x213920(_0x4166ab, _0x319b42) {
    const alphabet = _0x4af155.chars;
    const size = alphabet.length;
    let outer = 0;
    while (outer < size) {
        let inner = 0;
        while (inner < size) {
            const candidate = _0x319b42[0] + alphabet.charAt(outer) + alphabet.charAt(inner) + _0x319b42[1];
            if (hash(_0x4af155.ha, candidate) === _0x4166ab) {
                return candidate;
            }
            inner += 1;
        }
        outer += 1;
    }
    return undefined;
}
/**
 * Entry point called from Python: compute the `__jsl_clearance_s` value for
 * one challenge.
 *
 * @param {object} _0x4af155 - Challenge parameters; `ct` and `bts` are read
 *   here, and the whole object is published globally for `_0x213920`.
 * @returns {string|undefined} Whatever `_0x213920` returns.
 */
function encrypt_cookie(_0x4af155) {
    // _0x213920 reads this object as a free variable, so expose it on the global object.
    Object.assign(global, { _0x4af155 });
    const { ct: expectedDigest, bts: fragments } = _0x4af155;
    return _0x213920(expectedDigest, fragments);
}
转载请注明来源,欢迎对文章中的引用来源进行考证,欢迎指出任何有错误或不够清晰的表达,如有问题请邮件至2454612285@qq.com。