[Python] Python 22일차(파이썬을 이용한 머신러닝, 딥러닝 실전 개발 입문)

2018. 7. 2. 17:33

크롤링, 스크래핑을 위한 기초단계의 내용이다.

특별히 어려운 부분은 없었다.

urllib과 bs4를 항상 같이 쓴다는 점을 기억하면 좋을 것이다.

이 글의 내용은 "파이썬을 이용한 머신러닝, 딥러닝 실전 개발 입문"을 바탕으로 정리한 것이다.

========================= Python =========================

##### 데이터 다운로드

### urllib.request를 이용한 다운로드

# Load the library.
import urllib.request

# URL to fetch and the path to save it under.
url = 'http://uta.pw/shodou/img/28/214.png'
savename = 'test.png'

# Download straight to disk with urlretrieve().
urllib.request.urlretrieve(url, savename)
print('저장되었습니다.')

# Download with urlopen(): read the whole body into memory first.
# The response is used as a context manager so the connection is closed
# as soon as the body has been read.
with urllib.request.urlopen(url) as res:
    mem = res.read()

# read() returns bytes, so the file must be opened in binary mode ("wb");
# text mode ("w") does not produce a usable image file.
with open(savename, mode="wb") as f:
    f.write(mem)
print('저장되었습니다.')

##### 웹에서 데이터 추출하기

### 클라이언트 접속 정보 출력해보기

# Fetch the data. The response (http.client.HTTPResponse) is closed when the
# with-block exits.
url = "http://api.aoikujira.com/ip/ini"
with urllib.request.urlopen(url) as res:
    data = res.read()  # bytes

# Decode the raw bytes into text.
text = data.decode("utf-8")  # str
print(text)

### 매개변수를 추가해 요청을 전송하는 방법

import urllib.request
import urllib.parse

API = "http://www.kma.go.kr/weather/forecast/mid-term-rss3.jsp"

# URL-encode the query parameters.
values = {
    'stnId': '109',
}
params = urllib.parse.urlencode(values)

# Build the request URL.
url = API + "?" + params
print("url=", url)

# Download the feed; the response is closed when the with-block exits.
with urllib.request.urlopen(url) as res:
    data = res.read()
text = data.decode('utf-8')
print(text)

### 매개변수를 명령줄에서 지정하기

# When this section is saved as its own script, its first line should be the
# shebang:  #!/usr/bin/env python3

# Load the libraries.
import sys
import urllib.request as req
import urllib.parse as parse

# Read the region number from the command line.
if len(sys.argv) <= 1:
    print("USAGE: download-forecast-argv <Region Number>")
    # A missing argument is a usage error, so report a non-zero exit status.
    sys.exit(1)
regionNumber = sys.argv[1]

# URL-encode the query parameters.
API = "http://www.kma.go.kr/weather/forecast/mid-term-rss3.jsp"
values = {
    'stnId': regionNumber,
}
params = parse.urlencode(values)
url = API + "?" + params
print("url=", url)

# Download the feed; the response is closed when the with-block exits.
with req.urlopen(url) as res:
    data = res.read()  # bytes
text = data.decode('utf-8')  # str
print(text)

##### BeautifulSoup로 스크레이핑하기

### BeautifulSoup 기본 사용법

# Load the library.
from bs4 import BeautifulSoup

# HTML to analyse.
# NOTE(review): the published listing had lost the opening <html><body> tags
# and the second <p>. They are restored here because the code below
# dereferences soup.html.body and reads a second <p>. The text of the second
# paragraph is a reconstruction -- confirm against the book.
html = """
<html><body>
<h1>스크레이핑이란?</h1>
<p>웹 페이지를 분석하는 것</p>실험이다.
<p>원하는 부분을 추출하는 것</p>
</body></html>
"""

# Parse the markup with the built-in HTML parser.
soup = BeautifulSoup(html, 'html.parser')

# Pick out elements by walking the tree.
h1 = soup.html.body.h1  # first <h1> inside <body>
p1 = soup.html.body.p   # first <p> inside <body>

# next_sibling is applied twice because the node immediately after the first
# <p> is the text between the two tags ('실험이다.' plus the newline), not
# the next <p>; a single next_sibling would yield that text node.
# Use previous_sibling to walk in the other direction.
p2 = p1.next_sibling.next_sibling

# Print the text of each element. Without .string the whole tag would be
# shown, e.g. <h1>스크레이핑이란?</h1>.
print("h1 = " + h1.string)
print("p1 = " + p1.string)
print("p2 = " + p2.string)

### id로 요소를 찾는 법

from bs4 import BeautifulSoup

# NOTE(review): the published listing had lost the opening <html><body> tags
# and the element carrying id="body", so find(id="body") returned None and
# body.string failed. The restored paragraph text is a reconstruction --
# confirm against the book.
html = """
<html><body>
<h1 id="title">스크레이핑이란?</h1>
<p id="body">웹 페이지를 분석하는 것</p>
</body></html>
"""

# Parse the HTML.
soup = BeautifulSoup(html, 'html.parser')

# Look elements up by their id attribute with find().
title = soup.find(id="title")
body = soup.find(id="body")

# Print the text of each element.
print("#title =", title.string)
print('#body =', body.string)

### 여러 개의 요소 추출하기 - find_all() 메서드

from bs4 import BeautifulSoup

# The opening <html><body> tags are added so the markup is balanced with the
# closing tags that were already present.
html = """
<html><body>
<ul>
<li><a href="http://www.naver.com">naver</a></li>
</ul>
</body></html>
"""

# Parse the HTML.
soup = BeautifulSoup(html, 'html.parser')

# find_all("a") returns every anchor tag in the document.
links = soup.find_all("a")

# Print each link as "<text> > <href>".
for a in links:
    href = a.attrs['href']
    text = a.string
    print(text, ">", href)

### urlopen()과 BeautifulSoup 조합하기

from bs4 import BeautifulSoup
import urllib.request as req

url = "http://www.kma.go.kr/weather/forecast/mid-term-rss3.jsp"  # XML document

# Fetch and parse in one step; the response is closed when the with-block
# exits, after BeautifulSoup has consumed it.
with req.urlopen(url) as res:
    soup = BeautifulSoup(res, "html.parser")

# Extract the wanted data. find() returns a bs4.element.Tag; .string gives
# the text inside it.
title = soup.find("title").string  # text of the first <title>
wf = soup.find('wf').string        # text of the first <wf>
print(title)
print(wf)

### CSS 선택자 사용하기

from bs4 import BeautifulSoup

# HTML to analyse.
# NOTE(review): the published listing had lost most of its markup (the
# div#meigen wrapper, the <h1> and the ul.items opening tag), so both
# selectors below matched nothing. The structure is rebuilt from those
# selectors; the <h1> text is a reconstruction -- confirm against the book.
html = """
<html><body>
<div id="meigen">
<h1>위키북스 도서</h1>
<ul class="items">
<li>스위프트로 시작하는 아이폰 앱 개발 교과서</li>
</ul>
</div>
</body></html>
"""

# Parse the HTML.
soup = BeautifulSoup(html, 'html.parser')

# Title: the <h1> directly under <div id="meigen">.
# The selector syntax (div#meigen, ul.items) is the same as in jQuery/CSS.
h1 = soup.select_one('div#meigen > h1').string
print("h1 = " + h1)

# List items: every <li> under <ul class="items"> under <div id="meigen">.
li_list = soup.select("div#meigen > ul.items > li")
for li in li_list:
    print("li =", li.string)

### 네이버 금융에서 환율 정보 추출하기

from bs4 import BeautifulSoup
import urllib.request as req

# Fetch and parse the page; the response is closed when the with-block exits.
url = "http://info.finance.naver.com/marketindex/"
with req.urlopen(url) as res:
    soup = BeautifulSoup(res, "html.parser")

# Extract the first span.value directly under div.head_info and print it as
# the USD/KRW rate.
price = soup.select_one("div.head_info > span.value").string
print('usd/krw =', price)

##### CSS 선택자

### 위키 문헌에 공개돼 있는 윤동주 작가의 작품 목록 가져오기

from bs4 import BeautifulSoup
import urllib.request as req

# Download the Wikisource author page for Yun Dong-ju and parse it.
# The response is closed when the with-block exits.
url = "https://ko.wikisource.org/wiki/%EC%A0%80%EC%9E%90:%EC%9C%A4%EB%8F%99%EC%A3%BC"
with req.urlopen(url) as res:
    soup = BeautifulSoup(res, "html.parser")

# Select every <a> that sits under
#   div#mw-content-text > div.mw-parser-output > ul > li
a_list = soup.select("div#mw-content-text > div.mw-parser-output > ul > li > a")

# Print the text of each link.
for a in a_list:
    name = a.string
    print("-", name)

### CSS 선택자로 추출 연습하기

# Load the library.
from bs4 import BeautifulSoup

# Read the HTML file and parse it into a soup object. The with-block closes
# the file once parsing is done.
with open("books.html", encoding="UTF-8") as file:
    soup = BeautifulSoup(file, 'html.parser')

# Collect every <li> that is a direct child of a <ul>.
bible_list = soup.select("ul > li")

# Print each entry as "- (name)".
for li in bible_list:
    name = li.string
    print("-", name)

### CSS 선택자로 과일과 야채 선택해보기

# Load the library.
from bs4 import BeautifulSoup

# Parse the sample file; the with-block closes it once parsing is done.
with open("fruits-vegetables.html", encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, "html.parser")

# Extract with CSS selectors. Per the original notes, every line below is
# expected to print the avocado entry.
print(soup.select_one("li:nth-of-type(8)").string)
print(soup.select_one("#ve-list > li:nth-of-type(4)").string)
print(soup.select("#ve-list > li[data-lo='us']")[1].string)
print(soup.select("#ve-list > li.black")[1].string)

# Extract with find(), matching on several attributes at once (also avocado).
cond = {"data-lo": "us", "class": "black"}
print(soup.find("li", cond).string)

# Chain find() calls to narrow the search step by step (also avocado).
print(soup.find(id="ve-list").find("li", cond).string)

### 정규 표현식과 함께 조합하기

from bs4 import BeautifulSoup
import re  # regular expressions

# NOTE(review): the list items were missing from the published listing, which
# left nothing for the search below to match. The entries here are
# illustrative placeholders (one non-https and one https link) -- replace
# them with the book's sample data.
html = """
<ul>
<li><a href="http://example.com/aaa">aaa</a></li>
<li><a href="https://example.com/foo">foo</a></li>
</ul>
"""

soup = BeautifulSoup(html, 'html.parser')

# Keep only the elements whose href starts with "https://".
li = soup.find_all(href=re.compile(r"^https://"))
for e in li:
    print(e.attrs['href'])

### 정규식 내용 추가

data = """
홍길동 800905-1049118
전우치 700905-1039119
"""

# Version 1: mask the numbers without a regular expression.
result = []
for line in data.split("\n"):
    word_result = []
    for word in line.split(" "):
        # A number to mask is exactly 6 digits, a hyphen, then 7 digits.
        # All three parts are checked so that other 14-character words that
        # merely start with six digits are left untouched.
        if (len(word) == 14 and word[:6].isdigit()
                and word[6] == "-" and word[7:].isdigit()):
            word = word[:6] + "-" + "*******"
        word_result.append(word)
    result.append(" ".join(word_result))
print("\n".join(result))

# Version 2: the same masking with a regular expression.
import re

# Raw strings keep \d and \g from being treated as (invalid) string escapes.
pat = re.compile(r"(\d{6})[-](\d{7})")  # group 1: first 6 digits, group 2: last 7
print(pat.sub(r"\g<1>-*******", data))

##### 링크에 있는 것을 한꺼번에 내려받기

### 상대 경로를 전개하는 방법 - 1

from urllib.parse import urljoin

# Resolve several relative references against the same base URL and print
# each resulting absolute URL.
base = "http://example.com/html/a.html"
for relative in (
    "b.html",
    "sub/c.html",
    "../index.html",
    "../img/hoge.png",
    "../css/hoge.css",
):
    print(urljoin(base, relative))

### 상대 경로를 전개하는 방법 - 2

from urllib.parse import urljoin

# Resolve references that are not plain relative paths: a root-relative path,
# a full URL, and a scheme-relative ("//host/...") URL.
base = "http://example.com/html/a.html"
for target in (
    "/hoge.html",
    "http://otherExample.com/wiki",
    "//anotherExample.org/test",
):
    print(urljoin(base, target))

========================= Python =========================

'프로그래밍 > Python, R 프로그래밍' 카테고리의 다른 글

[Python] Python 24일차(파이썬을 이용한 머신러닝, 딥러닝 실전 개발 입문) (0)	2018.07.04
[Python] Python 23일차(파이썬을 이용한 머신러닝, 딥러닝 실전 개발 입문) (0)	2018.07.03
[Python] Python 21일차(예제로 배우는 파이썬 데이터 시각화) (0)	2018.06.20
[Python] Python 20일차(예제로 배우는 파이썬 데이터 시각화) (0)	2018.06.19
[Python] Python 19일차(예제로 배우는 파이썬 데이터 시각화) (0)	2018.06.15

데이터 분석가 블로그

[Python] Python 22일차(파이썬을 이용한 머신러닝, 딥러닝 실전 개발 입문)

'프로그래밍 > Python, R 프로그래밍' 카테고리의 다른 글

+ Recent posts

티스토리툴바