-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathTownCodeScraper.py
More file actions
146 lines (116 loc) · 4.66 KB
/
TownCodeScraper.py
File metadata and controls
146 lines (116 loc) · 4.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import json
import re
from urllib.parse import urlparse, parse_qs
import requests
from bs4 import BeautifulSoup
from sort_town_list import write_json
town_without_page_list = []
def search_cadastral_code_in_pages(page_list):
output_list = []
for url in page_list:
if "redlink" in url:
params = parse_qs(urlparse(url).query)
if "title" in params:
town_title = params['title'][0].replace("_", " ").replace("%27", "'")
town_without_page_list.append(town_title)
else:
resp = requests.get(url=url)
soup = BeautifulSoup(resp.content, 'html.parser')
td = soup.select('th')
town_name = td[0].contents[0]
for name, val in zip(td, td[1:]):
a = name.find_next('a')
if a.has_attr('title') and "catastale" in a['title']:
cadastral = name.parent.find_next('td').text.replace('\n', '')
province_node = name.parent.find_next('tr').find('td')
province = province_node.text.replace('\n', '') if province_node is not None else "-"
province = province if len(province) == 2 else "-"
print(town_name, cadastral, province)
output_list.append({
"cc": cadastral,
"id": town_name,
"p": province
})
break
return output_list
def search_cadastral_code_of_town(town_name):
response = requests.get("https://calcolocf.com/codice-catastale.html?comune={}".format(town_name))
soup = BeautifulSoup(response.content, 'html.parser')
cadastral_code = soup.find("div", {"class": "codice_catastale"})
town_code = soup.find("a", href=re.compile("comune="))
if cadastral_code is not None:
return cadastral_code.text, \
town_code.text.replace(town_name, "").replace(" ", "").replace("(", "").replace(")", "")
return "", ""
def wikipedia_scraper():
output_list = []
min_year = 1930
base_url = "https://it.wikipedia.org"
soup = BeautifulSoup(open("comunisoppressi.html"), "html.parser")
li_list = soup.find_all('li')
for li in li_list:
links = li.findAll('a', href=True)
source_town_pages = []
target_town_pages = []
i = 0
while i < len(links) and not links[i]['title'].isnumeric():
source_town_pages.append(base_url + links[i]['href'])
i = i + 1
if i < len(links):
year = links[i]['title']
if int(year) >= min_year:
i = i + 1
while i < len(links) and "cite_note" not in links[i]['href'] and not links[i]['title'].isnumeric():
target_town_pages.append(base_url + links[i]['href'])
i = i + 1
output_list = output_list + search_cadastral_code_in_pages(source_town_pages) + search_cadastral_code_in_pages(target_town_pages)
write_json(output_list, "soppressi")
def lombardia_scraper():
output_list = []
soup = BeautifulSoup(open("lombardia.html"), "html.parser")
li_list = soup.find_all('li')
for li in li_list:
links = li.findAll('a', href=True)
output_list.append(links[0]['title'])
return output_list
def lombardia_xxi_scraper():
output_list = []
soup = BeautifulSoup(open("lombardia_xxi.html"), "html.parser")
td_list = soup.find_all('td')
for td in td_list:
links = td.findAll('a', href=True)
if len(links) == 0 or links[0] is None:
continue
town_title = links[0]['title'].split(" (")[0]
output_list.append(town_title) if not town_title.isnumeric() else None
return output_list
def print_to_json(output_list):
with open('files/lombardia_soppressi.json', 'w') as output:
json.dump(output_list, output)
def scrape_wikipedia():
wikipedia_scraper()
print(town_without_page_list)
result_list = []
for town in town_without_page_list:
cadastral, province = search_cadastral_code_of_town(town)
result_list.append({
"cc": cadastral,
"id": town,
"p": province
})
print_to_json(result_list)
def main():
result_list = []
town_list = set(lombardia_scraper() + lombardia_xxi_scraper())
for town in town_list:
town = town.replace("Ca'", "Ca")
cadastral, province = search_cadastral_code_of_town(town)
print(cadastral)
result_list.append({
"cc": cadastral,
"id": town,
"p": province
})
print_to_json(result_list)
if __name__ == '__main__':
main()