levels.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
  1. import json
  2. from time import sleep
  3. from random import random
  4. from requests import request
  5. from bs4 import BeautifulSoup
  6. from requests.exceptions import ReadTimeout
  7. total = {"province": 0, "city": 0, "county": 0, "town": 0, "village": 0}
  8. run = {"province": 0, "city": 0, "county": 0, "town": 0}
  9. cur = {"province": 0, "city": 0, "county": 0}
  10. errs = []
  11. def save(filename: "str", data: "dict or list"):
  12. with open(filename, "w", encoding="utf-8") as fp:
  13. json.dump(data, fp, ensure_ascii=False, indent=2)
  14. fp.close()
  15. def get_village_of(root: "str", url: "str") -> "dict": # root: 2022/2/2/2/9.html
  16. res, resp = {}, None
  17. try:
  18. resp = request("get", f"{root}/{url}", timeout=2)
  19. except ReadTimeout:
  20. errs.append({"type": "village", "reason": "timeout", "url": f"{root}/{url}"})
  21. print(f"village timeout")
  22. return {}
  23. dom = BeautifulSoup(resp.content.decode("utf-8"), "lxml")
  24. targets = dom.select("tr.villagetr")
  25. size = len(targets)
  26. if size == 0:
  27. errs.append({"type": "village", "reason": "zero", "url": f"{root}/{url}"})
  28. print(f"village zero")
  29. return {}
  30. total["village"] += size
  31. for village in targets:
  32. infos = village.select("td")
  33. res[infos[0].text] = {"type": infos[1].text, "name": infos[2].text}
  34. print(f"{size:02} villages")
  35. return res
  36. def get_town_of(root: "str", url: "str") -> "dict": # root: 2022/2/2/6.html
  37. sleep(random() + 0.5)
  38. res, resp = {}, None
  39. try:
  40. resp = request("get", f"{root}/{url}", timeout=2)
  41. except ReadTimeout:
  42. errs.append({"type": "town", "reason": "timeout", "url": f"{root}/{url}"})
  43. print(f"|------town timeout")
  44. return {}
  45. dom = BeautifulSoup(resp.content.decode("utf-8"), "lxml")
  46. targets = dom.select("tr.towntr > td:last-child > a")
  47. run["town"] = len(targets)
  48. if run["town"] == 0:
  49. errs.append({"type": "town", "reason": "zero", "url": f"{root}/{url}"})
  50. print(f"|------town zero")
  51. return {}
  52. total["town"] += run["town"]
  53. for cur["town"], town in enumerate(targets):
  54. name, link = town.text, town["href"] # link: 2/9.html
  55. print(
  56. f"|------town [{cur['town'] + 1:02}/{run['town']:02}]"
  57. f" county [{cur['county'] + 1:02}/{run['county']:02}]"
  58. f" city [{cur['city'] + 1:02}/{run['city']:02}]"
  59. f" province [{cur['province'] + 1:02}/{run['province']:02}]",
  60. end=" => "
  61. )
  62. res[link[3:12]] = {"name": name, "children": get_village_of(f"{root}/{url[0:2]}", link)}
  63. return res
  64. def get_county_of(root: "str", url: "str") -> "dict": # root: 2022/2/4.html
  65. res, resp = {}, None
  66. try:
  67. resp = request("get", f"{root}/{url}", timeout=2)
  68. except ReadTimeout:
  69. errs.append({"type": "county", "reason": "timeout", "url": f"{root}/{url}"})
  70. print(f"|----county timeout")
  71. return {}
  72. dom = BeautifulSoup(resp.content.decode("utf-8"), "lxml")
  73. targets = dom.select("tr.countytr > td:last-child > a")
  74. run["county"] = len(targets)
  75. if run["county"] == 0:
  76. errs.append({"type": "county", "reason": "zero", "url": f"{root}/{url}"})
  77. print(f"|----county zero")
  78. return {}
  79. total["county"] += run["county"]
  80. for cur["county"], county in enumerate(targets):
  81. name, link = county.text, county["href"] # link: 2/6.html
  82. print(
  83. f"|----county [{cur['county'] + 1:02}/{run['county']:02}]"
  84. f" city [{cur['city'] + 1:02}/{run['city']:02}]"
  85. f" province [{cur['province'] + 1:02}/{run['province']:02}]"
  86. )
  87. res[link[3:9]] = {"name": name, "children": get_town_of(f"{root}/{url[0:2]}", link)}
  88. sleep(2)
  89. return res
  90. def get_cities_of(root: "str", url: "str") -> "dict": # root: 2022/2.html
  91. res, resp = {}, None
  92. try:
  93. resp = request("get", f"{root}/{url}", timeout=2)
  94. except ReadTimeout:
  95. errs.append({"type": "city", "reason": "timeout", "url": f"{root}/{url}"})
  96. print(f"|--city timeout")
  97. return {}
  98. dom = BeautifulSoup(resp.content.decode("utf-8"), "lxml")
  99. targets = dom.select("tr.citytr > td:last-child > a")
  100. run["city"] = len(targets)
  101. if run["city"] == 0:
  102. errs.append({"type": "city", "reason": "zero", "url": f"{root}/{url}"})
  103. print(f"|--city zero")
  104. return {}
  105. total["city"] += run["city"]
  106. for cur["city"], city in enumerate(targets):
  107. name, link = city.text, city["href"] # link: 2/4.html
  108. print(
  109. f"|--city <{name}> [{cur['city'] + 1:02}/{run['city']:02}]"
  110. f" of province [{cur['province'] + 1:02}/{run['province']:02}]"
  111. )
  112. res[link[3:7]] = {"name": name, "children": get_county_of(root, link)}
  113. sleep(3)
  114. return res
  115. def get_provinces(root: "str") -> "dict":
  116. res = {}
  117. resp = request("get", f"{root}/index.html")
  118. dom = BeautifulSoup(resp.content.decode("utf-8"), "lxml")
  119. targets = dom.select("tr.provincetr a")
  120. run["province"] = len(targets)
  121. assert run["province"] > 0
  122. total["province"] += run["province"]
  123. for cur["province"], province in enumerate(targets):
  124. name, link = province.text, province["href"] # link: 2.html
  125. print(f"province <{name}> [{cur['province'] + 1:02}/{run['province']:02}] from {root}/{link}")
  126. res[link[0:2]] = {"name": name, "children": get_cities_of(root, link)}
  127. sleep(5)
  128. return res
  129. def main():
  130. root = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022"
  131. # back = "http://www.tcmap.com.cn/"
  132. res_file, total_file, err_file = "main.json", "total.json", "error.json"
  133. res = get_provinces(root)
  134. print(total)
  135. save(total_file, total)
  136. print(errs)
  137. save(err_file, errs)
  138. print(res)
  139. save(res_file, res)
  140. if __name__ == "__main__":
  141. """
  142. province, city, county, town, village
  143. 省、市、县、镇、村
  144. """
  145. main()