example05.py

from urllib.error import URLError
from urllib.request import urlopen
import re
import redis
import ssl
import hashlib
import logging
import pickle
import zlib

# Redis offers two persistence schemes:
# 1. RDB (point-in-time snapshots)
# 2. AOF (append-only file)
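
# Illustrative sketch (not part of the original crawler): how the persistence
# settings mentioned above can be inspected from redis-py. The helper name is
# hypothetical, and CONFIG GET/SET require sufficient privileges on the server.
def show_persistence_settings(client):
    """Log the RDB snapshot rules and whether AOF is enabled."""
    logging.info('[RDB save rules] %s', client.config_get('save'))
    logging.info('[AOF enabled] %s', client.config_get('appendonly'))
    # To turn AOF on at runtime (make it permanent by editing redis.conf too):
    # client.config_set('appendonly', 'yes')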

# Decode the page with the given charsets
# (not every site declares utf-8 as its character set).
def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
            # logging.error('[Decode] failed to decode with %s', charset)
    return page_html
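
# Illustrative usage (hypothetical data): a GBK-encoded byte string is not
# valid utf-8, so the first attempt fails and the second charset succeeds.
# >>> decode_page('搜狐体育'.encode('gbk'), charsets=('utf-8', 'gbk'))
# '搜狐体育'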

# Fetch the HTML of a page (the retry logic is implemented through recursion,
# up to the given number of attempts).
def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8',)):
    page_html = None
    try:
        if seed_url.startswith('http://') or \
                seed_url.startswith('https://'):
            page_html = decode_page(urlopen(seed_url).read(), charsets)
    except URLError as err:
        logging.error('[URL] %s', err)
        if retry_times > 0:
            return get_page_html(seed_url, retry_times=retry_times - 1,
                                 charsets=charsets)
    return page_html

# Extract the needed parts from a page (usually links), selected by the
# regular expression supplied by the caller.
def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
    pattern_regex = re.compile(pattern_str, pattern_ignore_case)
    return pattern_regex.findall(page_html) if page_html else []
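
# Illustrative usage (hypothetical markup): pull every href out of a chunk of
# HTML; this pattern is a simpler variant of the one passed in from main().
# >>> get_matched_parts('<a href="http://sports.sohu.com/1.shtml">NBA</a>',
# ...                   r'<a[^>]+href=["\'](.*?)["\']')
# ['http://sports.sohu.com/1.shtml']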

# Run the crawler.
def start_crawl(seed_url, match_pattern, *, max_depth=-1):
    # Placeholder connection settings: point these at a real Redis server.
    client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
    charsets = ('utf-8', 'gbk', 'gb2312')
    logging.info('[Redis ping] %s', client.ping())
    url_list = [seed_url]
    visited_url_list = {seed_url: 0}
    while url_list:
        current_url = url_list.pop(0)
        depth = visited_url_list[current_url]
        if depth != max_depth:
            page_html = get_page_html(current_url, charsets=charsets)
            links_list = get_matched_parts(page_html, match_pattern)
            for link in links_list:
                if link not in visited_url_list:
                    visited_url_list[link] = depth + 1
                    # Queue the new link so the crawl continues breadth-first
                    # until max_depth is reached.
                    url_list.append(link)
                    page_html = get_page_html(link, charsets=charsets)
                    if page_html:
                        hasher = hashlib.md5()
                        hasher.update(link.encode('utf-8'))
                        zipped_page = zlib.compress(pickle.dumps(page_html))
                        client.set(hasher.hexdigest(), zipped_page)
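
# Illustrative counterpart to the storage step above (the function name is
# hypothetical): load a cached page back out of Redis by re-hashing its URL
# and reversing the zlib + pickle steps.
def load_cached_page(client, url):
    """Return the cached HTML for url, or None if it has not been stored."""
    key = hashlib.md5(url.encode('utf-8')).hexdigest()
    zipped_page = client.get(key)
    if zipped_page is None:
        return None
    return pickle.loads(zlib.decompress(zipped_page))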

def main():
    # Skip HTTPS certificate verification so pages with bad certificates
    # can still be fetched with urlopen.
    ssl._create_default_https_context = ssl._create_unverified_context
    start_crawl('http://sports.sohu.com/nba_a.shtml',
                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']',
                max_depth=2)


if __name__ == '__main__':
    main()
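
# Note (illustrative): the script never calls logging.basicConfig, so the
# logging.info messages above are discarded by the default WARNING threshold.
# Enabling them would look like this, placed before main() is invoked:
# logging.basicConfig(level=logging.INFO,
#                     format='%(asctime)s %(levelname)s %(message)s')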