IT is Smart

BeautifulSoup4 예제 본문

Programming/Text Mining

BeautifulSoup4 예제

달인최선 2016. 9. 1. 21:37
반응형
#coding=utf-8
#!/usr/bin/python

import re

from bs4 import BeautifulSoup


# Sample document used by the navigation examples below.  The
# <html>/<head>/<title> wrapper is needed by the soup.title and
# soup.title.parent lookups further down; without it soup.title is None.
html_doc = """
<html><head><title>IT is Smart</title></head>
<body>
<p class="title2"><b>IT is Smart</b></p>
<p class="title"><b>IT is Smart</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
"""


# Name the parser explicitly: when it is omitted bs4 picks whichever parser
# happens to be installed (and warns about it), so the resulting tree can
# differ from machine to machine.
soup = BeautifulSoup(html_doc, 'html.parser')

# Print the HTML with indentation
# -----------------------------------
# print(soup.prettify())

# The output begins roughly like this (abridged):
#
# <html>
#  <head>
#   <title>
#    IT is Smart
#   </title>
#  </head>
#  <body>
#   <p class="title2">
#    <b>
#     IT is Smart
#    </b>
#   </p>
#   ...
#  </body>
# </html>


# Inspect the <title> tag.  soup.title is None when the parsed document has
# no <title> element, so it is checked before any attribute is read from it.
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
title_tag = soup.title
print('soup.title=  %s' % (title_tag,))
# e.g. <title>IT is Smart</title>

if title_tag is not None:
    print('soup.title.name= %s' % (title_tag.name,))
    # title

    print('soup.title.string= %s' % (title_tag.string,))
    # e.g. IT is Smart

    print('soup.title.parent.name= %s' % (title_tag.parent.name,))
    # e.g. head (when <title> sits inside <head>)

    print('soup.title.parent.string =  %s' % (title_tag.parent.string,))

# soup.<tagname> returns the first tag with that name in the document.
print('soup.p =  %s' % (soup.p,))
# <p class="title2"><b>IT is Smart</b></p>

# "class" is a multi-valued attribute, so the lookup yields a list of class
# names rather than a single string.
print("soup.p['class']= %s" % (soup.p['class'],))
# ['title2']

print('soup.a =  %s' % (soup.a,))
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

# Find every <a> tag.  The result is kept so the loop below can reuse it
# instead of searching the tree a second time.
links = soup.find_all('a')
print("soup.find_all('a')= %s" % (links,))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# Look up a single tag by its id attribute.
print('soup.find(id="link3")= %s' % (soup.find(id="link3"),))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

# Print the target URL of every link.
for link in links:
    print(link.get('href'))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie

print('=================')

# Print the document's text with all markup stripped (output abridged):
print(soup.get_text())
# IT is Smart
#
# IT is Smart
#
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
#
# ...

# List items whose <a> holds a <span> with the date, followed by the title text.
text1 = """
<li><a href="/html/photo03/18932.html" target="_blank"><span>2013-10-09</span> 가을하늘은 높고 푸르다.</a></li>
<li><a href="/html/photo03/18931.html" target="_blank"><span>2013-10-09</span> 천고마비</a></li>
<li><a href="/html/photo03/18930.html" target="_blank"><span>2013-10-09</span> 무더운 여름이 가고 수확의 가을이 오다.</a></li>
"""

print('-----------------------------------')
# The parser is named explicitly so the tree does not depend on which
# optional parsers are installed.
soup = BeautifulSoup(text1, 'html.parser')
for link in soup.find_all('a'):
    print(link.get('href'))
    # Date text held by the nested <span>.
    print(link.span.string)
    # contents[0] is the <span>; contents[1] is the text that follows it.
    print('%s %s' % (link.contents[0].string, link.contents[1]))
    print('\n\n')
    
# Fragment with several class values ("next", "nexts", "class1", "class2")
# used to demonstrate searching by CSS class.
text2 = """
<span class="next">다음장:<a href="/html/photo03/18931.html"> [32]</a></span>
<span class="next">다음장:<a href="/html/photo03/18931.html"> [32]</a></span>
<span class="nexts">다음장:<a href="/html/photo03/18931.html">2rrrrr</a></span>
<span class="nexts">다음장:<a href="/html/photo03/18931.html">3rrrrr</a></span>
<p class="class1">1</p>
<p class="class1 class2">2</p>
<p class="class2">3</p>
"""
soup = BeautifulSoup(text2, 'html.parser')
# NOTE: "class" is a reserved word in Python, so the keyword argument is
# spelled class_, e.g. soup.find(class_="next").a.get('href')

# Find every tag whose class attribute includes "next" ("nexts" does not
# match), then take the link target of the first one.  find_all is the
# current name of the legacy findAll alias.
next_tags = soup.find_all(True, class_='next')
list_next = next_tags[0].a.get('href')

print(list_next)
# /html/photo03/18931.html

# Searching on one class name also matches tags that carry several classes:
#
#   In [2]: soup = bs4.BeautifulSoup('<div class="foo bar"></div>')
#   In [3]: soup(attrs={'class': 'bar'})
#   Out[3]: [<div class="foo bar"></div>]

# The same search with a regular expression: every tag with the class
# "class1".  The result is printed rather than silently discarded.
class1_tags = soup.find_all(True, attrs={'class': re.compile(r'\bclass1\b')})
print('class1 tags= %s' % (class1_tags,))
# [<p class="class1">1</p>, <p class="class1 class2">2</p>]


반응형