# example02.py (1.5 KB)

  1. from bs4 import BeautifulSoup
  2. import re
  3. def main():
  4. html = """
  5. <!DOCTYPE html>
  6. <html lang="en">
  7. <head>
  8. <meta charset="UTF-8">
  9. <title>首页</title>
  10. </head>
  11. <body>
  12. <h1>Hello, world!</h1>
  13. <p>Good!!!</p>
  14. <hr>
  15. <div>
  16. <h2>这是一个例子程序</h2>
  17. <p>静夜思</p>
  18. <p class="foo">床前明月光</p>
  19. <p id="bar">疑似地上霜</p>
  20. <p class="foo">举头望明月</p>
  21. <div><a href="http://www.baidu.com"><p>低头思故乡</p></a></div>
  22. </div>
  23. <a class="foo" href="http://www.qq.com">腾讯网</a>
  24. <img src="./img/pretty-girl.png" alt="美女">
  25. <img src="./img/hellokitty.png" alt="凯蒂猫">
  26. <img src="./static/img/pretty-girl.png" alt="美女">
  27. <goup>Hello, Goup!</goup>
  28. </body>
  29. </html>
  30. """
  31. # resp = requests.get('http://sports.sohu.com/nba_a.shtml')
  32. # html = resp.content.decode('gbk')
  33. soup = BeautifulSoup(html, 'lxml')
  34. print(soup.title)
  35. # JavaScript: document.body.h1
  36. # JavaScript: document.forms[0]
  37. print(soup.body.h1)
  38. print(soup.find_all(re.compile(r'p$')))
  39. print(soup.find_all('img', {'src': re.compile(r'\./img/\w+.png')}))
  40. print(soup.find_all(lambda x: len(x.attrs) == 2))
  41. print(soup.find_all('p', {'class': 'foo'}))
  42. for elem in soup.select('a[href]'):
  43. print(elem.attrs['href'])
  44. if __name__ == '__main__':
  45. main()