跳至内容
xlongwei
用户工具
登录
站点工具
工具
显示页面
修订记录
反向链接
最近更改
媒体管理器
网站地图
登录
最近更改
媒体管理器
网站地图
您在这里:
start
»
web
»
crawler.district
web:crawler.district
本页面只读。您可以查看源文件,但不能更改它。如果您觉得这是系统错误,请联系管理员。
* 数据设计:province(code,name) city(code,name) county(code,name) town(code,name) village(code,name,type) <code> provinces:{'51':{'name':'四川省'}} province51:{'5116':{'name':'广安市'}} city5116:{'511623':{'name':'邻水县'}} county511623:{'511623113':{'name':'丰禾镇'}} town511623113:{'002':{'name':'新华社区居委会','type':'121'}} 城乡分类代码(1城镇2乡村):111主城区,112城乡结合区,121镇中心区,122镇乡结合区,123特殊区域;210乡中心区,220村庄 province(code=51,name=四川省) city(code=5116,name=广安市) county(code=511623,name=邻水县) town(code=511623113,name=丰禾镇) village(code=511623113002,name=新华社区居委会,type=121) </code> * 数据处理:关闭日志jdbc.sqlonly和IndexHandler,province=31.0 city=342.0 county=2990.0 town=41513.0 village=632091.0 <code> provinces=util.hgetall("provinces") log.info("provinces={}",provinces.size()) counts={} provinces.entrySet() //.stream().limit(1) .forEach(function(provinceEntry){ provinceCode=provinceEntry.getKey() provinceName=util.get(provinceEntry.getValue(),"name") log.info("province(code={},name={})",provinceCode,provinceName) util.insert("insert ignore province values(?,?)",provinceCode,provinceName) cities=util.hgetall("province"+provinceCode) log.info("province={} cities={}",provinceName,cities.size()) counts.province=!counts.province?1:counts.province+1 cities.entrySet() //.stream().limit(1) .forEach(function(cityEntry){ cityCode=cityEntry.getKey() cityName=util.get(cityEntry.getValue(),"name") log.info("city(code={},name={})",cityCode,cityName) util.insert("insert ignore city values(?,?)",cityCode,cityName) counties=util.hgetall("city"+cityCode) log.info("city={} counties={}",cityName,counties.size()) counts.city=!counts.city?1:counts.city+1 counties.entrySet() //.stream().limit(1) .forEach(function(countyEntry){ countyCode=countyEntry.getKey() countyName=util.get(countyEntry.getValue(),"name") log.info("county(code={},name={})",countyCode,countyName) util.insert("insert ignore county values(?,?)",countyCode,countyName) towns=util.hgetall("county"+countyCode) log.info("county={} towns={}",countyName,towns.size()) counts.county=!counts.county?1:counts.county+1 towns.entrySet() //.stream().limit(1) .forEach(function(townEntry){ townCode=townEntry.getKey() townName=util.get(townEntry.getValue(),"name") //log.info("town(code={},name={})",townCode,townName) util.insert("insert ignore town values(?,?)",townCode,townName) villages=util.hgetall("town"+townCode) log.info("town={} villages={}",townName,villages.size()) counts.town=!counts.town?1:counts.town+1 villages.entrySet() //.stream().limit(1) .forEach(function(villageEntry){ villageCode=villageEntry.getKey(); villageName=util.get(villageEntry.getValue(),"name") villageType=util.get(villageEntry.getValue(),"type") log.info("village(code={},name={},type={})",townCode+villageCode,villageName,villageType) counts.village=!counts.village?1:counts.village+1 util.insert("insert ignore village values(?,?,?)",townCode+villageCode,villageName,villageType) }) //villages }) //towns }) //counties }) //cities }) //provinces log.warn("province={} city={} county={} town={} village={}",counts.province,counts.city,counts.county,counts.town,counts.village) </code> * level1,获取一级列表,http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html <code> urls=[] data={} prefix=url.substring(url.indexOf(':')+1,url.lastIndexOf('/')+1) $('tr.provincetr td a').each(function(idx,a){ href=$(this).attr('href') urls.push(prefix+href) code=href.substring(0,href.lastIndexOf('.')) name=$(this).text().trim() data['hset_provinces_'+code]={'name':name} }) crawl = {'urls':urls,data:data} </code> * level2,http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/51.html <code> urls=[] data={} prefix=url.substring(url.indexOf(':')+1,url.lastIndexOf('/')+1) province=url.substring(url.lastIndexOf('/')+1,url.lastIndexOf('.')) $('tr.citytr td:nth-child(2) a').each(function(idx,a){ href=$(this).attr('href') urls.push(prefix+href) code=href.substring(href.indexOf('/')+1,href.lastIndexOf('.')) name=$(this).text().trim() data['hset_province'+province+'_'+code]={'name':name} }) crawl = {'urls':urls,data:data} </code> * level3,http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/51/5116.html <code> urls=[] data={} prefix=url.substring(url.indexOf(':')+1,url.lastIndexOf('/')+1) city=url.substring(url.lastIndexOf('/')+1,url.lastIndexOf('.')) $('tr.countytr td:nth-child(2) a').each(function(idx,a){ href=$(this).attr('href') urls.push(prefix+href) code=href.substring(href.indexOf('/')+1,href.lastIndexOf('.')) name=$(this).text().trim() data['hset_city'+city+'_'+code]={'name':name} }) crawl = {'urls':urls,data:data} </code> * level4,http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/51/16/511623.html <code> urls=[] data={} prefix=url.substring(url.indexOf(':')+1,url.lastIndexOf('/')+1) county=url.substring(url.lastIndexOf('/')+1,url.lastIndexOf('.')) $('tr.towntr td:nth-child(2) a').each(function(idx,a){ href=$(this).attr('href') urls.push(prefix+href) code=href.substring(href.indexOf('/')+1,href.lastIndexOf('.')) name=$(this).text().trim() data['hset_county'+county+'_'+code]={'name':name} }) crawl = {'urls':urls,data:data} </code> * level5,http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/51/16/23/511623113.html <code> urls=[] data={} prefix=url.substring(url.indexOf(':')+1,url.lastIndexOf('/')+1) town=url.substring(url.lastIndexOf('/')+1,url.lastIndexOf('.')) $('tr.villagetr').each(function(idx,a){ td=$(this).children() code=td.first().text().trim().substring(town.length) type=td.eq(1).text().trim() name=td.last().text().trim() data['hset_town'+town+'_'+code]={'name':name,'type':type} }) crawl = {'urls':urls,data:data} </code>
web/crawler.district.txt
· 最后更改: 2021/03/26 18:47 由
admin
页面工具
显示页面
修订记录
反向链接
回到顶部