用户工具

站点工具


web:crawler.idcard

crawler

  • 数据设计:idcard:{code:{name,year}},总计6734条,counts={"1980":5,"1981":115,"1982":553,"1983":111,"1984":155,"1985":120,"1986":75,"1987":163,"1988":30,"1989":24,"1990":23,"1991":97,"1992":187,"1993":198,"1994":305,"1995":120,"1996":146,"1997":80,"1998":93,"1999":229,"2000":88,"2001":121,"2002":95,"2003":20,"2004":4,"2005":9,"2006":2,"2008":5,"2009":6,"2010":34,"2011":13,"2012":24,"2013":60,"2014":56,"2015":55,"2016":25,"2017":29,"2018":32,"2019":18,"2020":3209,"max":246,"null":1}
// Import step: read every entry stored under "idcards" via util.hgetall,
// insert it into the `idcard` table, and log simple statistics.
// NOTE(review): `util` and `log` are supplied by the host; the entry objects
// expose getKey()/getValue(), so the result is presumably a Java Map — confirm.
idcards=util.hgetall("idcards")
log.info('idcards={}',idcards.size())
// counts maps each year value to the number of entries seen for it; the extra
// "max" key holds the longest name length encountered.
counts={}
idcards.entrySet()
// Debugging aid: uncomment the next line to process a single entry only.
//.stream().limit(1)
.forEach(function(idcard){
    // Per-entry temporaries are declared locally so they do not leak as globals.
    var code=idcard.getKey()
    var name=util.get(idcard.getValue(),"name")
    var year=util.get(idcard.getValue(),"year")
    // Debugging aid: uncomment to trace each entry.
    //log.info('code={} name={} year={}',code,name,year)
    // "insert ignore" leaves existing rows with the same key untouched.
    util.insert("insert ignore idcard values(?,?,?)",code,name,year)
    counts[year]=(counts[year]||0)+1
    // A missing name counts as length 0 instead of throwing on `.length`.
    var nameLength=name?name.length:0
    if(!counts.max||nameLength>counts.max) counts.max=nameLength
})
log.info('counts={}',JSON.stringify(counts));
// Seed step: the crawl starts from the three listing pages of the archive
// (the base listing plus its "?2" and "?3" pages). No data is extracted here.
// NOTE(review): `crawl` is presumably the result object read by the crawler host.
urls=[
  'http://www.mca.gov.cn/article/sj/xzqh/1980/',
  'http://www.mca.gov.cn/article/sj/xzqh/1980/?2',
  'http://www.mca.gov.cn/article/sj/xzqh/1980/?3'
]
data={}
crawl={urls:urls,data:data}
// List-page step: queue the target of every anchor found inside a
// `td.arlisttd` cell. No data is extracted on this page.
// NOTE(review): `$` is a jQuery-style selector function supplied by the host,
// and `crawl` is presumably the result object the host reads — confirm.
urls=[]
data={}
prefix='http://www.mca.gov.cn'
$('td.arlisttd a').each(function(idx,a){
  var href=$(this).attr('href')
  // An anchor without an href would otherwise be queued as "<prefix>undefined".
  if(!href) return
  // Only site-relative links need the host prepended; absolute URLs are kept as-is.
  urls.push(/^https?:\/\//i.test(href)?href:prefix+href)
})
crawl = {'urls':urls,data:data}
// Detail-page step: inside the `#zoom` container, queue every link whose
// visible text mentions "代码" (code). No data is extracted on this page.
// NOTE(review): `$` is a jQuery-style selector function supplied by the host,
// and `crawl` is presumably the result object the host reads — confirm.
urls=[]
data={}
$('#zoom a').each(function(idx,a){
  var link=$(this)
  var href=link.attr('href')
  // `!== -1` also accepts text that starts with the keyword (index 0);
  // anchors without an href are skipped instead of queuing `undefined`.
  if(href && link.text().indexOf('代码')!==-1) urls.push(href)
})
crawl = {'urls':urls,data:data}
// Code-table step (with redirect): if the page redirects through a
// `window.location.href="..."` script, queue the redirect target; then scan
// every table row for a year heading or a "<6-digit code> <name>" entry.
// NOTE(review): `$` and `html` are supplied by the host; the
// `hsetnx_idcards_<code>` keys presumably make the host store each entry in
// the "idcards" hash only when absent — confirm.
urls=[]
data={}
data.rows=0
// Dots are escaped and the capture stops at the first closing quote, so
// trailing script text on the same line is not swallowed into the URL.
reg=/window\.location\.href="([^"]*)"/
if(reg.test(html)){ urls.push(reg.exec(html)[1]) }
$('tr').each(function(idx,a){
  var txt=$(this).text().trim()
  // A row containing "<4 digits>年" sets the year for the rows that follow.
  // The digits are captured directly so an earlier "年" in the row cannot
  // shift the extracted substring.
  var yearMatch=/(\d{4})年/.exec(txt)
  if(yearMatch){
    data.year=yearMatch[1]
    return
  }
  var codeMatch=/\d{6}/.exec(txt)
  if(!codeMatch) return
  var code=codeMatch[0]
  // Everything after the code (including text on later lines) is the name.
  var name=txt.substring(codeMatch.index+code.length).trim()
  if(name){
    data['hsetnx_idcards_'+code]={name:name,year:data.year}
    data.rows++
  }
})
crawl = {'urls':urls,data:data}
// Code-table step: scan every table row for a year heading or a
// "<6-digit code> <name>" entry. No further URLs are queued from this page.
// NOTE(review): `$` is supplied by the host; the `hsetnx_idcards_<code>` keys
// presumably make the host store each entry in the "idcards" hash only when
// absent — confirm.
urls=[]
data={}
data.rows=0
$('tr').each(function(idx,a){
  var txt=$(this).text().trim()
  // A row containing "<4 digits>年" sets the year for the rows that follow.
  // The digits are captured directly so an earlier "年" in the row cannot
  // shift the extracted substring.
  var yearMatch=/(\d{4})年/.exec(txt)
  if(yearMatch){
    data.year=yearMatch[1]
    return
  }
  var codeMatch=/\d{6}/.exec(txt)
  if(!codeMatch) return
  var code=codeMatch[0]
  // Everything after the code (including text on later lines) is the name.
  var name=txt.substring(codeMatch.index+code.length).trim()
  if(name){
    data['hsetnx_idcards_'+code]={name:name,year:data.year}
    data.rows++
  }
})
crawl = {'urls':urls,data:data}
web/crawler.idcard.txt · 最后更改: 2022/01/12 15:21 由 admin