example06.py

from hashlib import sha1
from urllib.parse import urljoin
import pickle
import re
import zlib

import requests
from bs4 import BeautifulSoup
from redis import Redis


def main():
    # Seed page to start crawling from
    base_url = 'https://www.zhihu.com/'
    seed_url = urljoin(base_url, 'explore')
    # Create a Redis client
    client = Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
    # Set the user agent
    headers = {'user-agent': 'Baiduspider'}
    # Send a GET request through the requests module with the specified user agent
    resp = requests.get(seed_url, headers=headers)
    # Create a BeautifulSoup object, using lxml as the parser
    soup = BeautifulSoup(resp.text, 'lxml')
    href_regex = re.compile(r'^/question')
    # Find all <a> tags whose href attribute starts with /question
    for a_tag in soup.find_all('a', {'href': href_regex}):
        # Get the href attribute of the <a> tag and build the full URL
        href = a_tag.attrs['href']
        full_url = urljoin(base_url, href)
        # Reduce the URL to a SHA-1 digest (fixed length and more compact)
        hasher = sha1()
        hasher.update(full_url.encode('utf-8'))
        field_key = hasher.hexdigest()
        # If the hash stored at the Redis key 'zhihu' does not yet contain this
        # digest, fetch the page and cache it
        if not client.hexists('zhihu', field_key):
            html_page = requests.get(full_url, headers=headers).text
            # Serialize and compress the page
            zipped_page = zlib.compress(pickle.dumps(html_page))
            # Store the URL digest and the corresponding page in a Redis hash
            client.hset('zhihu', field_key, zipped_page)
    # Show how many pages have been cached in total
    print('Total %d question pages found.' % client.hlen('zhihu'))


if __name__ == '__main__':
    main()
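
Reading a cached page back out of Redis is simply the reverse of the caching steps above: fetch the bytes from the hash, decompress them, then unpickle. Below is a minimal sketch, assuming the same Redis connection details and the 'zhihu' hash key used in example06.py; load_cached_page is a hypothetical helper, not part of the original script.

from pickle import loads
from zlib import decompress

from redis import Redis


def load_cached_page(field_key):
    # Assumes the same Redis host/credentials and 'zhihu' hash as example06.py
    client = Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
    zipped_page = client.hget('zhihu', field_key)
    if zipped_page is None:
        return None
    # Undo the zlib.compress + pickle.dumps applied when the page was cached
    return loads(decompress(zipped_page))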