用户工具

站点工具


web:crawler.district
  • 数据设计:province(code,name) city(code,name) county(code,name) town(code,name) village(code,name,type)
provinces:{'51':{'name':'四川省'}}
province51:{'5116':{'name':'广安市'}}
city5116:{'511623':{'name':'邻水县'}}
county511623:{'511623113':{'name':'丰禾镇'}}
town511623113:{'002':{'name':'新华社区居委会','type':'121'}}
城乡分类代码(1城镇2乡村):111主城区,112城乡结合区,121镇中心区,122镇乡结合区,123特殊区域;210乡中心区,220村庄
province(code=51,name=四川省) city(code=5116,name=广安市) county(code=511623,name=邻水县) town(code=511623113,name=丰禾镇) village(code=511623113002,name=新华社区居委会,type=121)
  • 数据处理:运行前先关闭 jdbc.sqlonly 和 IndexHandler 的日志;运行结果统计:province=31.0 city=342.0 county=2990.0 town=41513.0 village=632091.0
// Copies the crawled district hierarchy from the hash store into SQL tables.
//
// Hashes are read level by level: "provinces" -> "province<code>" -> "city<code>"
// -> "county<code>" -> "town<code>". Each hash entry maps a code to a value whose
// "name" (and, for villages, "type") field is read through util.get.
// `util` and `log` are supplied by the host environment; the objects returned by
// util.hgetall are used as map-like collections exposing size() and entrySet()
// (presumably Java collections -- confirm against the host).
//
// All variables are declared with `var` so that nothing leaks into the shared
// scope, and each callback keeps its own copy of the current code/name.
//
// Debugging aid: replace `.entrySet().forEach(` with
// `.entrySet().stream().limit(1).forEach(` at any level to process only the
// first entry of that level.
var provinces = util.hgetall("provinces");
log.info("provinces={}", provinces.size());

// Counters start at 0 so the summary line reports 0 (instead of undefined)
// for a level that has no entries at all.
var counts = { province: 0, city: 0, county: 0, town: 0, village: 0 };

provinces.entrySet().forEach(function (provinceEntry) {
    var provinceCode = provinceEntry.getKey();
    var provinceName = util.get(provinceEntry.getValue(), "name");
    log.info("province(code={},name={})", provinceCode, provinceName);
    util.insert("insert ignore province values(?,?)", provinceCode, provinceName);

    var cities = util.hgetall("province" + provinceCode);
    log.info("province={} cities={}", provinceName, cities.size());
    counts.province += 1;

    cities.entrySet().forEach(function (cityEntry) {
        var cityCode = cityEntry.getKey();
        var cityName = util.get(cityEntry.getValue(), "name");
        log.info("city(code={},name={})", cityCode, cityName);
        util.insert("insert ignore city values(?,?)", cityCode, cityName);

        var counties = util.hgetall("city" + cityCode);
        log.info("city={} counties={}", cityName, counties.size());
        counts.city += 1;

        counties.entrySet().forEach(function (countyEntry) {
            var countyCode = countyEntry.getKey();
            var countyName = util.get(countyEntry.getValue(), "name");
            log.info("county(code={},name={})", countyCode, countyName);
            util.insert("insert ignore county values(?,?)", countyCode, countyName);

            var towns = util.hgetall("county" + countyCode);
            log.info("county={} towns={}", countyName, towns.size());
            counts.county += 1;

            towns.entrySet().forEach(function (townEntry) {
                var townCode = townEntry.getKey();
                var townName = util.get(townEntry.getValue(), "name");
                // No per-town "town(code=..,name=..)" log line here; only the
                // village count below is logged for each town.
                util.insert("insert ignore town values(?,?)", townCode, townName);

                var villages = util.hgetall("town" + townCode);
                log.info("town={} villages={}", townName, villages.size());
                counts.town += 1;

                villages.entrySet().forEach(function (villageEntry) {
                    // The hash key holds only the village suffix; the stored
                    // village code is the town code followed by that suffix.
                    var villageCode = townCode + villageEntry.getKey();
                    var villageName = util.get(villageEntry.getValue(), "name");
                    var villageType = util.get(villageEntry.getValue(), "type");
                    log.info("village(code={},name={},type={})", villageCode, villageName, villageType);
                    counts.village += 1;
                    util.insert("insert ignore village values(?,?,?)", villageCode, villageName, villageType);
                }); // villages
            }); // towns
        }); // counties
    }); // cities
}); // provinces

log.warn("province={} city={} county={} town={} village={}", counts.province, counts.city, counts.county, counts.town, counts.village);
// Province index page: collects the link of every province (to be crawled next)
// and records each province name under the key 'hset_provinces_<code>'.
// `url` (address of the current page) and `$` (jQuery-style selector over the
// page) are supplied by the host; `crawl` is the result handed back to it --
// presumably read by the host after the script finishes, confirm against it.
var urls = [];
var data = {};
// Part of the page URL after the first ':' up to and including the last '/';
// hrefs found on the page are appended to it to form the follow-up URLs.
var prefix = url.substring(url.indexOf(':') + 1, url.lastIndexOf('/') + 1);

$('tr.provincetr td a').each(function () {
  var link = $(this);
  var href = link.attr('href');
  if (!href) {
    // An anchor without href gives neither a URL to follow nor a code to key on.
    return;
  }
  // The code is the href without its extension (everything before the last '.').
  var code = href.substring(0, href.lastIndexOf('.'));
  var name = link.text().trim();
  urls.push(prefix + href);
  data['hset_provinces_' + code] = { 'name': name };
});

// Left as an undeclared assignment, exactly like the original, so the host
// keeps seeing `crawl` in the same scope as before.
crawl = { 'urls': urls, data: data };
// Province page: collects the link of every city (to be crawled next) and
// records each city name under the key 'hset_province<provinceCode>_<cityCode>'.
// `url` (address of the current page) and `$` (jQuery-style selector over the
// page) are supplied by the host; `crawl` is the result handed back to it --
// presumably read by the host after the script finishes, confirm against it.
var urls = [];
var data = {};
// Part of the page URL after the first ':' up to and including the last '/';
// hrefs found on the page are appended to it to form the follow-up URLs.
var prefix = url.substring(url.indexOf(':') + 1, url.lastIndexOf('/') + 1);
// File name of the current page without its extension, used as the province code.
var province = url.substring(url.lastIndexOf('/') + 1, url.lastIndexOf('.'));

// Only the anchor in the second cell of each row is used.
$('tr.citytr td:nth-child(2) a').each(function () {
  var link = $(this);
  var href = link.attr('href');
  if (!href) {
    // An anchor without href gives neither a URL to follow nor a code to key on.
    return;
  }
  // The code sits between the first '/' and the last '.' of the href.
  var code = href.substring(href.indexOf('/') + 1, href.lastIndexOf('.'));
  var name = link.text().trim();
  urls.push(prefix + href);
  data['hset_province' + province + '_' + code] = { 'name': name };
});

// Left as an undeclared assignment, exactly like the original, so the host
// keeps seeing `crawl` in the same scope as before.
crawl = { 'urls': urls, data: data };
// City page: collects the link of every county (to be crawled next) and
// records each county name under the key 'hset_city<cityCode>_<countyCode>'.
// `url` (address of the current page) and `$` (jQuery-style selector over the
// page) are supplied by the host; `crawl` is the result handed back to it --
// presumably read by the host after the script finishes, confirm against it.
var urls = [];
var data = {};
// Part of the page URL after the first ':' up to and including the last '/';
// hrefs found on the page are appended to it to form the follow-up URLs.
var prefix = url.substring(url.indexOf(':') + 1, url.lastIndexOf('/') + 1);
// File name of the current page without its extension, used as the city code.
var city = url.substring(url.lastIndexOf('/') + 1, url.lastIndexOf('.'));

// Only the anchor in the second cell of each row is used; rows without an
// anchor do not match the selector and are therefore not recorded.
$('tr.countytr td:nth-child(2) a').each(function () {
  var link = $(this);
  var href = link.attr('href');
  if (!href) {
    // An anchor without href gives neither a URL to follow nor a code to key on.
    return;
  }
  // The code sits between the first '/' and the last '.' of the href.
  var code = href.substring(href.indexOf('/') + 1, href.lastIndexOf('.'));
  var name = link.text().trim();
  urls.push(prefix + href);
  data['hset_city' + city + '_' + code] = { 'name': name };
});

// Left as an undeclared assignment, exactly like the original, so the host
// keeps seeing `crawl` in the same scope as before.
crawl = { 'urls': urls, data: data };
// County page: collects the link of every town (to be crawled next) and
// records each town name under the key 'hset_county<countyCode>_<townCode>'.
// `url` (address of the current page) and `$` (jQuery-style selector over the
// page) are supplied by the host; `crawl` is the result handed back to it --
// presumably read by the host after the script finishes, confirm against it.
var urls = [];
var data = {};
// Part of the page URL after the first ':' up to and including the last '/';
// hrefs found on the page are appended to it to form the follow-up URLs.
var prefix = url.substring(url.indexOf(':') + 1, url.lastIndexOf('/') + 1);
// File name of the current page without its extension, used as the county code.
var county = url.substring(url.lastIndexOf('/') + 1, url.lastIndexOf('.'));

// Only the anchor in the second cell of each row is used.
$('tr.towntr td:nth-child(2) a').each(function () {
  var link = $(this);
  var href = link.attr('href');
  if (!href) {
    // An anchor without href gives neither a URL to follow nor a code to key on.
    return;
  }
  // The code sits between the first '/' and the last '.' of the href.
  var code = href.substring(href.indexOf('/') + 1, href.lastIndexOf('.'));
  var name = link.text().trim();
  urls.push(prefix + href);
  data['hset_county' + county + '_' + code] = { 'name': name };
});

// Left as an undeclared assignment, exactly like the original, so the host
// keeps seeing `crawl` in the same scope as before.
crawl = { 'urls': urls, data: data };
// Town page: records every village row under the key
// 'hset_town<townCode>_<villageSuffix>' with its name and type. No links are
// collected here, so `urls` stays empty.
// `url` (address of the current page) and `$` (jQuery-style selector over the
// page) are supplied by the host; `crawl` is the result handed back to it --
// presumably read by the host after the script finishes, confirm against it.
var urls = [];
var data = {};
// Computed as in the other page scripts but not used below, since this page
// yields no follow-up URLs.
var prefix = url.substring(url.indexOf(':') + 1, url.lastIndexOf('/') + 1);
// File name of the current page without its extension, used as the town code.
var town = url.substring(url.lastIndexOf('/') + 1, url.lastIndexOf('.'));

$('tr.villagetr').each(function () {
  var cells = $(this).children();
  // First cell: the code with its first town.length characters dropped,
  // leaving only the village suffix (assumes the cell text starts with the
  // town code -- confirm against the crawled pages).
  var code = cells.first().text().trim().substring(town.length);
  // Second cell: classification type; last cell: village name.
  var type = cells.eq(1).text().trim();
  var name = cells.last().text().trim();
  data['hset_town' + town + '_' + code] = { 'name': name, 'type': type };
});

// Left as an undeclared assignment, exactly like the original, so the host
// keeps seeing `crawl` in the same scope as before.
crawl = { 'urls': urls, data: data };
web/crawler.district.txt · 最后更改: 2021/03/26 18:47 由 admin