# main.py
import re
import urllib.request
from urllib.error import ContentTooShortError, HTTPError, URLError

def download(url, user_agent='ci3030', num_retries=2, charset='utf-8'):
	"""Fetch *url* and return its body decoded to text.

	Args:
		url: Address to fetch; anything ``urllib.request.urlopen`` accepts.
		user_agent: Value sent in the ``User-agent`` request header.
		num_retries: How many more attempts to make when the server answers
			with an HTTP 5xx status. Other failures are not retried.
		charset: Fallback encoding used when the response headers do not
			declare one.

	Returns:
		The decoded response body, or ``None`` if the download failed.
	"""
	print("Downloading: ", url)
	request = urllib.request.Request(url)
	request.add_header('User-agent', user_agent)
	try:
		# Open the prepared request (so the User-agent header is actually
		# sent) and close the response as soon as the body has been read.
		with urllib.request.urlopen(request) as resp:
			cs = resp.headers.get_content_charset() or charset
			html = resp.read().decode(cs)
	except (URLError, HTTPError, ContentTooShortError) as e:
		# `reason` is an attribute on URLError and its subclasses, not a method.
		print('Download Error ', e.reason)
		html = None
		# Only HTTPError carries `code`; 5xx means the server may recover,
		# so those are the only failures worth retrying.
		if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
			# Forward every option explicitly so the retry budget shrinks and
			# the caller's user agent / charset are preserved.
			return download(url, user_agent, num_retries - 1, charset)
	return html

def crawl_sitemap(url):
	"""Download the sitemap located at *url* using ``download``.

	NOTE(review): in the code visible here the downloaded text is only bound
	to a local name; nothing parses or returns it. Confirm whether this
	function is unfinished.
	"""
	# `download` is written to return the decoded body, or None on failure.
	sitemap=download(url)