main.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. import json
  2. from time import sleep
  3. from bs4 import BeautifulSoup
  4. from requests import request
  5. all_res = {}
  6. def get_village(url: "str", direct: "bool" = True):
  7. res, town = {}, url[-14:-5] # town: 9L
  8. resp = request("get", url, timeout=2)
  9. dom = BeautifulSoup(resp.content.decode("utf-8"), "lxml")
  10. targets = dom.select("tr.villagetr")
  11. size = len(targets)
  12. assert size > 0
  13. for village in targets:
  14. infos = village.select("td")
  15. res[infos[0].text] = {"type": infos[1].text, "name": infos[2].text}
  16. print(f"|----{size} villages")
  17. if not direct:
  18. return res
  19. city, county = town[0:4], town[0:6]
  20. if city not in all_res.keys():
  21. all_res[city] = {}
  22. if county not in all_res[city].keys():
  23. all_res[city][county] = {}
  24. all_res[city][county][town] = res
  25. def get_town(url: "str"):
  26. res, county = {}, url[-11:-5] # cid: 6L
  27. resp = request("get", url, timeout=2)
  28. dom = BeautifulSoup(resp.content.decode("utf-8"), "lxml")
  29. targets = dom.select("tr.towntr > td:last-child > a")
  30. size = len(targets)
  31. assert size > 0
  32. for i, town in enumerate(targets):
  33. name, link = town.text, town["href"] # link: 2/9.html
  34. print(f"|--town [{i + 1:02}/{size:02}] from {url[:-11]}/{link}")
  35. res[link[3:12]] = {"name": name, "children": get_village(f"{url[:-11]}/{link}", False)}
  36. sleep(1)
  37. city = county[0:4]
  38. if city not in all_res.keys():
  39. all_res[city] = {}
  40. all_res[city][county] = res
  41. def get_county(url: "str"):
  42. res, city = {}, url[-9:-5] # city: 4L
  43. resp = request("get", url, timeout=2)
  44. dom = BeautifulSoup(resp.content.decode("utf-8"), "lxml")
  45. targets = dom.select("tr.towntr > td:last-child > a")
  46. size = len(targets)
  47. assert size > 0
  48. for i, town in enumerate(targets):
  49. name, link = town.text, town["href"] # link: 2/9.html
  50. print(f"|--county [{i + 1:02}/{size:02}] from {url[:-9]}/{link}")
  51. res[link[3:12]] = {"name": name, "children": get_village(f"{url[:-9]}/{link}", False)}
  52. sleep(1)
  53. all_res[city] = res
  54. def main():
  55. with open("error.json", "r", encoding="utf-8") as fp:
  56. data = json.load(fp)
  57. for item in data:
  58. if item["type"] == "county":
  59. get_county(item["url"])
  60. elif item["type"] == "town":
  61. get_town(item["url"])
  62. else:
  63. get_village(item["url"])
  64. with open("deal.json", "w", encoding="utf-8") as fp:
  65. json.dump(all_res, fp, ensure_ascii=False, indent=2)
  66. if __name__ == '__main__':
  67. main()
  68. """
  69. type enum('111', '112', '121', '122', '123', '210', '220')
  70. level enum('A1+', 'A2+', 'A3+', 'A4+', 'A5-', 'B1+', 'B2-')
  71. """