example02.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. from bs4 import BeautifulSoup
  2. import re
  3. def main():
  4. html = """
  5. <!DOCTYPE html>
  6. <html lang="en">
  7. <head>
  8. <meta charset="UTF-8">
  9. <title>首页</title>
  10. </head>
  11. <body>
  12. <h1>Hello, world!</h1>
  13. <p>这是一个<em>神奇</em>的网站!</p>
  14. <hr>
  15. <div>
  16. <h2>这是一个例子程序</h2>
  17. <p>静夜思</p>
  18. <p class="foo">床前明月光</p>
  19. <p id="bar">疑似地上霜</p>
  20. <p class="foo">举头望明月</p>
  21. <div><a href="http://www.baidu.com"><p>低头思故乡</p></a></div>
  22. </div>
  23. <a class="foo" href="http://www.qq.com">腾讯网</a>
  24. <img src="./img/pretty-girl.png" alt="美女">
  25. <img src="./img/hellokitty.png" alt="凯蒂猫">
  26. <img src="/static/img/pretty-girl.png" alt="美女">
  27. <table>
  28. <tr>
  29. <th>姓名</th>
  30. <th>上场时间</th>
  31. <th>得分</th>
  32. <th>篮板</th>
  33. <th>助攻</th>
  34. </tr>
  35. </table>
  36. </body>
  37. </html>
  38. """
  39. soup = BeautifulSoup(html, 'lxml')
  40. # JavaScript - document.title
  41. print(soup.title)
  42. # JavaScript - document.body.h1
  43. print(soup.body.h1)
  44. print(soup.p)
  45. print(soup.body.p.text)
  46. print(soup.body.p.contents)
  47. for p_child in soup.body.p.children:
  48. print(p_child)
  49. print(len([elem for elem in soup.body.children]))
  50. print(len([elem for elem in soup.body.descendants]))
  51. print(soup.findAll(re.compile(r'^h[1-6]')))
  52. print(soup.body.find_all(r'^h'))
  53. print(soup.body.div.find_all(re.compile(r'^h')))
  54. print(soup.find_all(re.compile(r'r$')))
  55. print(soup.find_all('img', {'src': re.compile(r'\./img/\w+.png')}))
  56. print(soup.find_all(lambda x: len(x.attrs) == 2))
  57. print(soup.find_all(foo))
  58. print(soup.find_all('p', {'class': 'foo'}))
  59. for elem in soup.select('a[href]'):
  60. print(elem.attrs['href'])
  61. def foo(elem):
  62. return len(elem.attrs) == 2
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()