# ukrstat.gov.ua crawling script
import os
import time
import zipfile
from io import BytesIO

import requests
from bs4 import BeautifulSoup

BASE_URL = "http://ukrstat.gov.ua/Noviny/"
BASE_DIR = "ukrstat"
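# Example of the month page URL the loops below construct (April 2014):
#   http://ukrstat.gov.ua/Noviny/new2014/new2014_u/new_u04.html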
# test script to check that the news page link for each month is valid
"""
for i in ['%02d' % i for i in range(9, 14)]:
    for j in ['%02d' % j for j in range(1, 13)]:
        url = BASE_URL + "new20" + i + "/new20" + i + "_u/new_u" + j + ".html"
        r = requests.get(url)
        time.sleep(1)
        if r.status_code == 200:
            print("Link for year", i, "and month", j, "is ok")
        else:
            print("Link for year", i, "and month", j, "is wrong")
"""
# all links check out, so let's move on to scraping
START_YEAR = 9   # the current version of the site has no 'news' earlier than 2009
END_YEAR = 14    # range() excludes the end value: 14 covers 2009-2013; set to 15 to include 2014
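
# The bare requests.get() calls below have no error handling, so a single
# transient network failure aborts the whole crawl. A minimal retry helper
# one could swap in is sketched here (the retry count, delay, and timeout
# values are assumptions, not part of the original script):
def fetch(url, retries=3, delay=2, timeout=30):
    """Fetch a URL, retrying on connection errors with a fixed delay."""
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(delay)
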

def main():
    """
    Download files from ukrstat.gov.ua, unpacking zip archives and creating
    a directory for each month in the year range.
    """
    if not os.path.exists(BASE_DIR):
        os.mkdir(BASE_DIR)
    for i in ['%02d' % i for i in range(START_YEAR, END_YEAR)]:
        for j in ['%02d' % j for j in range(1, 13)]:  # months 01 to 12
            url = BASE_URL + "new20" + i + "/new20" + i + "_u/new_u" + j + ".html"
            r = requests.get(url)
            time.sleep(1)  # to not overload the ukrstat server
            fullpath = BASE_DIR + "/" + i + "/" + j + "/"
            if not os.path.exists(fullpath):
                os.makedirs(fullpath)
            soup = BeautifulSoup(r.text, "html.parser")
            links = [x.get('href') for x in soup.find_all('a')]
            # keep only relative links into the operativ/operativ... tree
            oplinks = [link for link in links if link and "operativ/operativ" in link]
            oplinks = [link for link in oplinks if link.startswith('..')]
            newlinks = [link.replace('../../..', 'http://ukrstat.gov.ua') for link in oplinks]
            for link in newlinks:
                # strip the 31-character prefix 'http://ukrstat.gov.ua/operativ/'
                # and flatten the remaining path into a single file name
                filename = link[31:].replace('/', '_')
                page = requests.get(link)
                if page.status_code != 404:
                    if filename.endswith(".zip"):
                        z = zipfile.ZipFile(BytesIO(page.content))
                        z.extractall(fullpath)
                        print("Unpacking zipped file", filename, "to the directory")
                    else:
                        with open(fullpath + filename, "wb") as f:
                            f.write(page.content)
                        print("Writing file", filename, "to the directory")

def testmain():
    """
    Test version of the crawl, limited to a single month (April 2014)
    """
    testdirpath = 'test/'
    if not os.path.exists(testdirpath):
        os.mkdir(testdirpath)
    url = BASE_URL + "new2014/new2014_u/new_u04.html"
    r = requests.get(url)
    time.sleep(1)  # to not overwhelm ukrstat.gov.ua
    soup = BeautifulSoup(r.text, "html.parser")
    links = [x.get('href') for x in soup.find_all('a')]
    oplinks = [link for link in links if link and "operativ/operativ" in link]
    oplinks = [link for link in oplinks if link.startswith('..')]
    newlinks = [link.replace('../../..', 'http://ukrstat.gov.ua') for link in oplinks]
    for link in newlinks:
        filename = link[31:].replace('/', '_')
        page = requests.get(link)
        if page.status_code != 404:
            if filename.endswith(".zip"):
                z = zipfile.ZipFile(BytesIO(page.content))
                z.extractall(testdirpath)
                print("Unpacking zipped file", filename, "to the directory")
            else:
                with open(testdirpath + filename, "wb") as f:
                    f.write(page.content)
                print("Writing file", filename, "to the directory")

if __name__ == "__main__":
    # change main() to testmain() to use the test version of the script
    main()
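
# Usage (the script takes no command-line arguments):
#   python ukrstat.py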