# example02.py
import re

from bs4 import BeautifulSoup


  3. def main():
  4. html = """
  5. <!DOCTYPE html>
  6. <html lang="en">
  7. <head>
  8. <meta charset="UTF-8">
  9. <title>首页</title>
  10. </head>
  11. <body>
  12. <h1>Hello, world!</h1>
  13. <p>这是一个神奇的网站!</p>
  14. <hr>
  15. <div>
  16. <h2>这是一个例子程序</h2>
  17. <p>静夜思</p>
  18. <p class="foo">床前明月光</p>
  19. <p id="bar">疑似地上霜</p>
  20. <p class="foo">举头望明月</p>
  21. <div><a href="http://www.baidu.com"><p>低头思故乡</p></a></div>
  22. </div>
  23. <a class="foo" href="http://www.qq.com">腾讯网</a>
  24. <img src="./img/pretty-girl.png" alt="美女">
  25. <img src="./img/hellokitty.png" alt="凯蒂猫">
  26. <img src="/static/img/pretty-girl.png" alt="美女">
  27. <table>
  28. <tr>
  29. <th>姓名</th>
  30. <th>上场时间</th>
  31. <th>得分</th>
  32. <th>篮板</th>
  33. <th>助攻</th>
  34. </tr>
  35. </table>
  36. </body>
  37. </html>
  38. """
  39. soup = BeautifulSoup(html, 'lxml')
  40. # JavaScript - document.title
  41. print(soup.title)
  42. # JavaScript - document.body.h1
  43. print(soup.body.h1)
  44. print(soup.find_all(re.compile(r'^h')))
  45. print(soup.find_all(re.compile(r'r$')))
  46. print(soup.find_all('img', {'src': re.compile(r'\./img/\w+.png')}))
  47. print(soup.find_all(lambda x: len(x.attrs) == 2))
  48. print(soup.find_all('p', {'class': 'foo'}))
  49. for elem in soup.select('a[href]'):
  50. print(elem.attrs['href'])
  51. if __name__ == '__main__':
  52. main()