"""Re-fetch administrative-division pages listed in ``error.json``.

Each entry of ``error.json`` is a dict with a ``"url"`` and a ``"type"``
(``"county"``, ``"town"`` or anything else, which is treated as a village
page).  The scraped data is accumulated in the module-level ``all_res``
dict and finally written to ``deal.json``.
"""
import json
from time import sleep
from typing import Optional

from bs4 import BeautifulSoup
from requests import request

# Accumulated results, keyed by the 4-character city code.
all_res = {}

# CSS selector for the town links; shared by get_town() and get_county().
_TOWN_LINK_SELECTOR = "tr.towntr > td:last-child > a"


def _select_rows(url: str, selector: str) -> list:
    """Download *url* and return the elements matching *selector*.

    Raises:
        AssertionError: if nothing matches.  An explicit ``raise`` is used
            instead of an ``assert`` statement so that the check is still
            performed when Python runs with ``-O``.
    """
    resp = request("get", url, timeout=2)
    dom = BeautifulSoup(resp.content.decode("utf-8"), "lxml")
    rows = dom.select(selector)
    if not rows:
        raise AssertionError(f"no element matches {selector!r} in {url}")
    return rows


def _collect_towns(url: str, base: str, label: str) -> dict:
    """Fetch every town linked from *url* together with its villages.

    Args:
        url: page that contains the town links.
        base: prefix that each link's ``href`` is appended to.
        label: word used in the progress message.

    Returns:
        ``{town_code: {"name": ..., "children": {...villages...}}}``.
    """
    links = _select_rows(url, _TOWN_LINK_SELECTOR)
    total = len(links)
    towns = {}
    for index, anchor in enumerate(links, start=1):
        name, link = anchor.text, anchor["href"]  # link: 2/9.html
        child_url = f"{base}/{link}"
        print(f"|--{label} [{index:02}/{total:02}] from {child_url}")
        # link[3:12] is the 9-character code that follows the "NN/" prefix.
        towns[link[3:12]] = {
            "name": name,
            "children": get_village(child_url, False),
        }
        # Pause between requests.
        sleep(1)
    return towns


def get_village(url: str, direct: bool = True) -> Optional[dict]:
    """Scrape the village rows (``tr.villagetr``) of one town page.

    Args:
        url: town page; the 9 characters before the final 5 are taken as
            the town code (assumes the URL ends in ``<9 chars>.html``).
        direct: when true, store the result in ``all_res`` under
            city / county / town and return ``None``; when false, only
            return the result.

    Returns:
        ``{first_cell_text: {"type": second_cell, "name": third_cell}}``
        when *direct* is false, otherwise ``None``.

    Raises:
        AssertionError: if the page contains no village rows.
    """
    town = url[-14:-5]  # town: 9L
    rows = _select_rows(url, "tr.villagetr")
    res = {}
    for row in rows:
        cells = row.select("td")
        res[cells[0].text] = {"type": cells[1].text, "name": cells[2].text}
    print(f"|----{len(rows)} villages")
    if not direct:
        return res
    city, county = town[0:4], town[0:6]
    # NOTE: the town entry stored here is the bare village mapping, without
    # the {"name", "children"} wrapper that get_town()/get_county() produce.
    all_res.setdefault(city, {}).setdefault(county, {})[town] = res
    return None


def get_town(url: str) -> None:
    """Scrape all towns (and their villages) linked from one county page.

    The 6 characters before the final 5 of *url* are taken as the county
    code (assumes the URL ends in ``<6 chars>.html``); the result is stored
    at ``all_res[city][county]``.

    Raises:
        AssertionError: if the page contains no town links.
    """
    county = url[-11:-5]  # cid: 6L
    res = _collect_towns(url, url[:-11], "town")
    all_res.setdefault(county[0:4], {})[county] = res


def get_county(url: str) -> None:
    """Scrape the town links found on a city page, with their villages.

    The 4 characters before the final 5 of *url* are taken as the city code
    (assumes the URL ends in ``<4 chars>.html``); the result replaces
    ``all_res[city]``.

    NOTE(review): this reads town rows (``tr.towntr``) straight from the
    city page -- presumably for cities that have no county level; confirm.

    Raises:
        AssertionError: if the page contains no town links.
    """
    city = url[-9:-5]  # city: 4L
    all_res[city] = _collect_towns(url, url[:-9], "county")


def main() -> None:
    """Process every entry of ``error.json`` and write ``deal.json``."""
    with open("error.json", "r", encoding="utf-8") as fp:
        data = json.load(fp)
    for item in data:
        kind = item["type"]
        if kind == "county":
            get_county(item["url"])
        elif kind == "town":
            get_town(item["url"])
        else:
            get_village(item["url"])
    with open("deal.json", "w", encoding="utf-8") as fp:
        json.dump(all_res, fp, ensure_ascii=False, indent=2)


if __name__ == '__main__':
    main()

"""
type enum('111', '112', '121', '122', '123', '210', '220')
level enum('A1+', 'A2+', 'A3+', 'A4+', 'A5-', 'B1+', 'B2-')
"""