반응형
뷰티풀 수프는 크롤링 프로젝트에서 scrapy와 같이 가장 많이 사용되는 도구중 하나다.
HTML 문서에서 필요한 부분만 쉽게 추출할 수 있게 해주는 파싱(구문 분석) 모듈이다.
# BeautifulSoup 추가하기
1
2
3
4
5
6
7
|
from bs4 import BeautifulSoup as bs
from urllib import request
url = 'https://www.example.com'
html = request.urlopen(url)
soup = bs(html, 'html.parser')
|
cs |
# 정갈하게 출력하기
prettify() 사용
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
from bs4 import BeautifulSoup as bs
from urllib import request
# prettify() 활용하기
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = bs(markup)
soup.prettify()
# '<html>\n <head>\n </head>\n <body>\n <a href="http://example.com/">\n...'
print(soup.prettify())
# 출력결과
# <html>
# <head>
# </head>
# <body>
# <a href="http://example.com/">
# I linked to
# <i>
# example.com
# </i>
# </a>
# </body>
# </html>
|
cs |
# 태그 이름 사용하기
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
soup.head
# <head><title>The Dormouse's story</title></head>
soup.title
# <title>The Dormouse's story</title>
soup.body.b
# <b>The Dormouse's story</b>
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
|
cs |
# contents / children 사용하기
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
# .contents 사용하기
head_tag = soup.head
head_tag
# <head><title>The Dormouse's story</title></head>
head_tag.contents
# [<title>The Dormouse's story</title>]
title_tag = head_tag.contents[0]
title_tag
# <title>The Dormouse's story</title>
title_tag.contents
# [u'The Dormouse's story']
soup.contents[0].name
# u'html'
# 문자열(NavigableString)은 contents 사용 불가함 / for 문 사용
text = title_tag.contents[0]
text.contents
# AttributeError: 'NavigableString' object has no attribute 'contents'
for child in title_tag.children:
print(child)
# The Dormouse's story
|
cs |
# descendants 사용하기
1
2
3
|
for child in head_tag.descendants:
print(child)
|
cs |
# string 사용하기
1
2
3
4
5
6
7
8
9
10
11
12
|
# 태그에 자손이 한개만 존재해야 string 사용가능함
title_tag.string
# u'The Dormouse's story'
head_tag.string
# u'The Dormouse's story'
# 자식이 둘 이상 존재하면 string은 None을 반환한다.
print(soup.html.string)
# None
|
cs |
# stripped_strings 사용하기
1
2
3
4
5
6
7
8
9
10
11
12
13
|
# string에서 불필요한 공백을 제거할 때 사용한다.
for string in soup.stripped_strings:
print(repr(string))
# u"The Dormouse's story"
# u"The Dormouse's story"
# u'Once upon a time there were three little sisters; and their names were'
# u'Elsie'
# u','
# u'Lacie'
# u'and'
# u'Tillie'
# u';\nand they lived at the bottom of a well.'
# u'...'
|
cs |
# parents 사용하기
선택한 요소 위로 올라가면서 탐색하는 도구다.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
# parents는 선택한 요소 위에 있는 모든 상위 태그들을 검색한다.
link = soup.a
link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
for parent in link.parents:
if parent is None:
print(parent)
else:
print(parent.name)
# p
# body
# html
# [document]
# None
|
cs |
# next_sibling(s) / previous_sibling(s) 사용하기
동일 레벨에 있는 태그들을 가져온다.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
sibling_soup.b.next_sibling
# <c>text2</c>
sibling_soup.c.previous_sibling
# <b>text1</b>
# next_siblings / previous_siblings는 동일 레벨 모든 태그 검색
for sibling in soup.a.next_siblings:
print(repr(sibling))
# u',\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# u' and\n'
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# u'; and they lived at the bottom of a well.'
# None
for sibling in soup.find(id="link3").previous_siblings:
print(repr(sibling))
# ' and\n'
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
# u',\n'
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# u'Once upon a time there were three little sisters; and their names were\n'
# None
|
cs |
# list 활용하기
1
2
3
4
5
6
|
soup.find_all(["a", "b"])
# [<b>The Dormouse's story</b>,
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
|
cs |
# limit 인자 활용하기
find_all() 메소드를 사용할 때 필요한 개수만큼만 결과를 제한하는 인자다.
1
2
3
4
|
soup.find_all("a", limit=2)
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
|
cs |
# recursive 인자 활용하기
전체 검색결과에서 직계자손 태그만 출력하는 인자다.
1
2
3
4
5
|
soup.html.find_all("title")
# [<title>The Dormouse's story</title>]
soup.html.find_all("title", recursive=False)
# []
|
cs |
# find_all() 활용하기
find_all()은 조건에 맞는 모든 태그를 찾아 리스트로 반환한다.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
# name 인자 활용
soup.find_all("title")
# [<title>The Dormouse's story</title>]
# css 활용
soup.find_all("p", "title")
# [<p class="title"><b>The Dormouse's story</b></p>]
# 태그 활용
soup.find_all("a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# 키워드 인자
soup.find_all(id="link2")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
# 정규표현식
import re
soup.find(text=re.compile("sisters"))
# u'Once upon a time there were three little sisters; and their names were\n'
css_soup.find_all("p", class_="body strikeout")
# [<p class="body strikeout"></p>]
soup.find_all("a", "sister")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# 텍스트 활용
soup.find_all(text="Elsie")
# [u'Elsie']
|
cs |
# find() 활용하기
find()는 조건에 맞는 첫 번째 태그 한 개만 반환한다.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
soup.find_all('title', limit=1)
# [<title>The Dormouse's story</title>]
soup.find('title')
# <title>The Dormouse's story</title>
# 태그 이름을 사용해서 검색하는 것은
# find()함수를 반복 실행하는 것이다.
soup.head.title
# <title>The Dormouse's story</title>
soup.find("head").find("title")
# <title>The Dormouse's story</title>
|
cs |
# find_parent(s)() 활용하기
검색 태그의 상위 태그들을 검색한다.
1
2
3
4
5
6
7
8
9
|
a_string.find_parents("a")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
a_string.find_parent("p")
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
|
cs |
# find_next_sibling(s)()
동일 레벨의 다음 태그들을 검색한다.
1
2
3
4
5
6
7
|
first_link.find_next_siblings("a")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_next_sibling("p")
# <p class="story">...</p>
|
cs |
# find_previous_sibling(s)()
동일 레벨의 이전 태그들을 검색한다.
1
2
3
4
5
6
7
|
last_link.find_previous_siblings("a")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_previous_sibling("p")
# <p class="title"><b>The Dormouse's story</b></p>
|
cs |
# find_all_next() / find_next() 활용하기
find_all_next() : 문서에서 해당 요소 이후에 나오는 모든 태그들을 검색한다.
find_next() : 문서에서 해당 요소 이후에 나오는 태그 한 개를 검색한다.
1
2
3
4
5
6
|
first_link.find_all_next(text=True)
# [u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie',
# u';\nand they lived at the bottom of a well.', u'\n\n', u'...', u'\n']
first_link.find_next("p")
# <p class="story">...</p>
|
cs |
# find_all_previous() / find_previous() 활용하기
find_all_previous() : 문서에서 해당 요소 이전에 나오는 모든 태그들을 검색한다.
find_previous() : 문서에서 해당 요소 이전에 나오는 태그 한 개를 검색한다.
1
2
3
4
5
6
|
first_link.find_all_previous("p")
# [<p class="story">Once upon a time there were three little sisters; ...</p>,
# <p class="title"><b>The Dormouse's story</b></p>]
first_link.find_previous("title")
# <title>The Dormouse's story</title>
|
cs |
# CSS 선택자 활용하기
# 태그 검색
1
2
|
soup.select("title")
# [<title>The Dormouse's story</title>]
|
cs |
# 태그 아래의 태그를 검색한다.
1
2
3
4
|
soup.select("body a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
|
cs |
# 태그 바로 아래의 태그를 검색한다.
1
2
3
4
5
6
7
8
9
10
|
soup.select("head > title")
# [<title>The Dormouse's story</title>]
soup.select("p > a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("body > a")
# []
|
cs |
# CSS 클래스로 검색한다.
1
2
3
4
5
6
7
8
9
10
|
soup.select(".sister")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("[class~=sister]")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
|
cs |
# ID로 검색한다.
1
2
3
4
5
6
|
soup.select("#link1")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup.select("a#link2")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
|
cs |
# 속성이 존재하는지 테스트 한다
1
2
3
4
5
|
soup.select('a[href]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
|
cs |
# 속성으로 태그를 찾는다.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
soup.select('a[href="http://example.com/elsie"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup.select('a[href^="http://example.com/"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select('a[href$="tillie"]')
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select('a[href*=".com/el"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
|
cs |
# href 속성 찾기
# 태그안의 속성을 찾는다.
tag.attrs['속성값']
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | html = """ <html> <body> <h1>Test</h1> <div class="row"> <p>This is Test</p> <a href="https://www.test.com">This is a tag</a> </div> </body> </html> """ soup = bs(html, 'html.parser') print(soup.a.string) print(soup.a.attrs['href']) | cs |
반응형
'Programming > Crawling' 카테고리의 다른 글
파이썬 urllib.request VS requests 차이점? (0) | 2021.12.18 |
---|---|
파이썬 urllib.request VS requests 차이점? (0) | 2021.12.18 |
URL URI URN 이란? (0) | 2021.12.17 |
CrawlSpider Rule 10분만에 이해하기 (0) | 2021.12.06 |
댓글