# example01.py -- simple breadth-first web-crawler example
# (header reconstructed from copy-paste residue: original lines held the
# filename/size caption and a run of concatenated line numbers)
import re
from collections import deque
from urllib.error import URLError
from urllib.request import urlopen

import pymysql
  5. def get_page_code(start_url, *, retry_times=3, charsets=('utf-8', )):
  6. try:
  7. for charset in charsets:
  8. try:
  9. html = urlopen(start_url).read().decode(charset)
  10. break
  11. except UnicodeDecodeError:
  12. html = None
  13. except URLError as ex:
  14. print('Error:', ex)
  15. return get_page_code(start_url, retry_times=retry_times - 1, charsets=charsets) if \
  16. retry_times > 0 else None
  17. return html
  18. def main():
  19. url_list = ['http://sports.sohu.com/nba_a.shtml']
  20. visited_list = set({})
  21. while len(url_list) > 0:
  22. current_url = url_list.pop(0)
  23. visited_list.add(current_url)
  24. print(current_url)
  25. html = get_page_code(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
  26. if html:
  27. link_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
  28. link_list = re.findall(link_regex, html)
  29. url_list += link_list
  30. conn = pymysql.connect(host='localhost', port=3306,
  31. db='crawler', user='root',
  32. passwd='123456', charset='utf8')
  33. try:
  34. for link in link_list:
  35. if link not in visited_list:
  36. visited_list.add(link)
  37. print(link)
  38. html = get_page_code(link, charsets=('utf-8', 'gbk', 'gb2312'))
  39. if html:
  40. title_regex = re.compile(r'<h1>(.*)<span', re.IGNORECASE)
  41. match_list = title_regex.findall(html)
  42. if len(match_list) > 0:
  43. title = match_list[0]
  44. with conn.cursor() as cursor:
  45. cursor.execute('insert into tb_result (rtitle, rurl) values (%s, %s)',
  46. (title, link))
  47. conn.commit()
  48. finally:
  49. conn.close()
  50. print('执行完成!')
# Script entry point: start the crawler only when run directly,
# not when this module is imported.
if __name__ == '__main__':
    main()