[{"descr":"车次信息","name":"train"} ,{"descr":"小阅读","name":"pdnovel"} ,{"descr":"妹子图","name":"mzitu"} ,{"descr":"类UI","name":"layui"} ,{"descr":"行政区划","name":"district"}]
脚本示例:url 为 "html:"+网址,html 为文本;返回的 urls 为下一级网址,data[key]=json 为数据;key 形如 hset_{map}_{field},或 hsetnx_{map}_{field}(增量)
demo1)示例:step=name 单步;+N 宽度;-2 从第二步开始;'0' 全部
cheerio语法:.attr text children nth-child(n) first eq(n) last find next each
// Page script for one listing page: collects the next-level links and one record per link.
// Host-provided inputs: `url` ("html:" + page address) and `$` (cheerio handle on the fetched page).
// Result: global `crawl` = {urls: next-level addresses, data: records keyed "hset_{map}_{field}"}.
// Top-level names are left undeclared on purpose so they stay visible to the host as before.
urls = [];
data = {};
// Strip everything up to the first ':' and keep the address up to and including its last '/'.
prefix = url.substring(url.indexOf(':') + 1, url.lastIndexOf('/') + 1);
// File name of the current page without its extension (e.g. ".../51.html" -> "51").
province = url.substring(url.lastIndexOf('/') + 1, url.lastIndexOf('.'));
$('tr.citytr td:nth-child(2) a').each(function () {
  var link = $(this);
  var href = link.attr('href');
  // .attr() returns undefined when the attribute is absent; skip such anchors
  // instead of throwing on substring() and queuing a bogus "...undefined" address.
  if (!href) {
    return;
  }
  urls.push(prefix + href);
  // The code is the part of href between its first '/' and its extension
  // (presumably hrefs look like "51/5116.html" - confirm against the page).
  var code = href.substring(href.indexOf('/') + 1, href.lastIndexOf('.'));
  var name = link.text().trim();
  data['hset_province' + province + '_' + code] = {'name': name};
});
crawl = {'urls': urls, data: data};
数据处理:script
// Cleanup, to be run from a shell rather than by the script engine:
// deletes every key matching "property:city*" in redis database 1.
//   redis-cli -n 1 keys "property:city*"| xargs -tl -i redis-cli -n 1 del "{}"

// While testing, analyse the logs; once that succeeds, clean up the data and then crawl everything.
// ('0' means "all" according to the step notes earlier in this file.)
runStep('district', '0');

// Copy every entry of the "provinces" hash into the province table.
provinces = util.hgetall("provinces");
provinces.entrySet()
  //.stream().limit(1)   // NOTE(review): presumably a limiter for trial runs - confirm before enabling
  .forEach(function (provinceEntry) {
    // Entry key is the province code; the entry value carries a "name" field.
    var provinceCode = provinceEntry.getKey();
    var provinceName = util.get(provinceEntry.getValue(), "name");
    log.info("province(code={},name={})", provinceCode, provinceName);
    util.insert("insert ignore province values(?,?)", provinceCode, provinceName);
  });
pdnovel:小阅读获取
[{"descr":"获取分页","level":"1","name":"page","url":"http://bbs.xlongwei.com/pdnovel.php?mod=list"} ,{"descr":"获取书籍","level":"2","name":"novel","url":"http://bbs.xlongwei.com/pdnovel.php?mod=list"} ,{"descr":"获取章节","level":"3","name":"chapter","url":"http://bbs.xlongwei.com/pdnovel.php?mod=chapter&novelid=48"} ,{"descr":"获取内容","level":"4","name":"content","url":"http://bbs.xlongwei.com/pdnovel.php?mod=read&novelid=48&chapterid=15435"}]
train:车次信息获取
[{"level":"1","name":"train","descr":"获取省份链接","url":"http://qq.ip138.com/train/"} ,{"level":"2","name":"province","descr":"获取城市链接","url":"http://qq.ip138.com/train/anhui/"} ,{"level":"3","name":"city","descr":"获取车次链接","url":"http://qq.ip138.com/train/anhui/AnQing.htm"} ,{"level":"4","name":"line","descr":"获取站台信息","url":"http://qq.ip138.com/train/D5601.htm"}]
mzitu:图片网址获取
[{"level":"1","name":"page","descr":"获取所有页码","url":"http://www.mzitu.com/"} ,{"level":"2","name":"img","descr":"获取单页图片","url":"https://www.mzitu.com/page/2/"} ,{"level":"3","name":"page2","descr":"获取大图分页","url":"https://www.mzitu.com/181419/"} ,{"level":"4","name":"img2","descr":"获取大图链接","url":"https://www.mzitu.com/181419/50"}]
district:全国行政区划2020
[{"descr":"获取一级列表","level":"1","name":"level1","url":"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html"} ,{"descr":"获取二级列表","level":"2","name":"level2","url":"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/51.html"} ,{"descr":"获取三级列表","level":"3","name":"level3","url":"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/51/5116.html"} ,{"descr":"获取四级列表","level":"4","name":"level4","url":"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/51/16/511623.html"} ,{"descr":"获取五级列表","level":"5","name":"level5","url":"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/51/16/23/511623113.html"}]
[{"descr":"分页","level":"1","name":"page","url":"http://www.mca.gov.cn/article/sj/xzqh/1980/"} ,{"descr":"子页","level":"2","name":"sub","url":"http://www.mca.gov.cn/article/sj/xzqh/1980/"} ,{"descr":"level3","level":"3","name":"level3","url":"http://www.mca.gov.cn/article/sj/xzqh/1980/202105/20210500033655.shtml"} ,{"descr":"level4","level":"4","name":"level4","url":"http://www.mca.gov.cn/article/sj/xzqh/2020/20201201.html"}]