# example04.py — crawl the Zhihu "explore" page and count question links.
  1. from urllib.parse import urljoin
  2. import re
  3. import requests
  4. from bs4 import BeautifulSoup
  5. def main():
  6. headers = {'user-agent': 'Baiduspider'}
  7. proxies = {
  8. 'http': 'http://122.114.31.177:808'
  9. }
  10. base_url = 'https://www.zhihu.com/'
  11. seed_url = urljoin(base_url, 'explore')
  12. resp = requests.get(seed_url,
  13. headers=headers,
  14. proxies=proxies)
  15. soup = BeautifulSoup(resp.text, 'lxml')
  16. href_regex = re.compile(r'^/question')
  17. link_set = set()
  18. for a_tag in soup.find_all('a', {'href': href_regex}):
  19. if 'href' in a_tag.attrs:
  20. href = a_tag.attrs['href']
  21. full_url = urljoin(base_url, href)
  22. link_set.add(full_url)
  23. print('Total %d question pages found.' % len(link_set))
  24. if __name__ == '__main__':
  25. main()