[Python] Python 28일차(파이썬을 이용한 머신러닝, 딥러닝 실전 개발 입문)

2018. 7. 16. 18:37

한국어 분석(형태소 분석), Word2Vec으로 문장을 벡터로 변환하기를 해보았다.

Word2Vec 모델의 most_similar()에 positive, negative 인자를 주어 문서 내에서 두 단어의 상관관계라든가 벡터로 변환한 단어들의 관계 등을 알 수 있었다.

여기 나오는 내용은 "파이썬을 이용한 머신러닝, 딥러닝 실전 개발 입문"에 나오는 내용이다.

========================== Python ==========================

import codecs

from bs4 import BeautifulSoup

from konlpy.tag import Twitter

# Count how often each noun occurs in the corpus and print the 50 most
# frequent ones as "word(count)".

# Open the UTF-16 encoded corpus. BeautifulSoup consumes the file object
# while building the tree, so the handle can be closed right after parsing.
with codecs.open("BEXX0003.txt", "r", encoding="utf-16") as fp:
    soup = BeautifulSoup(fp, "html.parser")

body = soup.select_one("body > text")  # the <text> element directly under <body>
text = body.getText()  # plain text of the selected element

# Process the text one line at a time.
twitter = Twitter()
word_dic = {}
lines = text.split("\n")  # a line break is the basic unit of analysis
for line in lines:
    # pos() yields (surface, tag) tuples for the line, for example:
    #   []                                   (empty line)
    #   [('제', 'Noun'), ('1', 'Number'), ('편', 'Noun'), ('어둠', 'Noun'), ...]
    #   [('1897', 'Number'), ('년', 'Noun'), ('의', 'Josa'), ('한가위', 'Noun'), ...]
    malist = twitter.pos(line)
    for word in malist:
        # word is (surface, tag); the tag in word[1] tells whether it is a noun.
        if word[1] == "Noun":
            # Start unseen nouns at 0 so the first occurrence counts as 1.
            # (Initialising to 1 and then incrementing made every count one
            # too high.)
            word_dic[word[0]] = word_dic.get(word[0], 0) + 1

# Print the most frequently used nouns.
# Sort the (noun, count) pairs by count, largest first.
keys = sorted(word_dic.items(), key=lambda x: x[1], reverse=True)
for word, count in keys[:50]:  # top 50 entries only
    print("{0}({1})".format(word, count), end="\n")

'''

'결과'

것(629)

그(520)

말(378)

안(305)

소리(199)

...

'''

### Gensim의 Word2vec으로 "토지"를 읽어보기

## 경고(warning) 메시지를 숨기는 방법

import warnings

warnings.filterwarnings(action='ignore', category=UserWarning)

warnings.filterwarnings(action='ignore', category=FutureWarning)

import codecs

from bs4 import BeautifulSoup

from konlpy.tag import Twitter

from gensim.models import word2vec

# Tokenise the corpus into space-separated base forms, write one line per
# input line to "toji.wakati", then train and save a Word2Vec model from it.

# Open the UTF-16 encoded corpus. BeautifulSoup consumes the file object
# while building the tree, so the handle can be closed right after parsing.
with codecs.open("BEXX0003.txt", "r", encoding="utf-16") as fp:
    soup = BeautifulSoup(fp, "html.parser")

body = soup.select_one("body > text")
text = body.getText()

# Part-of-speech tags that are dropped: particles, endings and punctuation.
excluded_tags = ("Josa", "Eomi", "Punctuation")

# Process the text one line at a time.
twitter = Twitter()
results = []
lines = text.split("\n")
for line in lines:
    # Morphological analysis; norm/stem make pos() return normalised base forms.
    malist = twitter.pos(line, norm=True, stem=True)
    r = [word[0] for word in malist if word[1] not in excluded_tags]
    rl = (" ".join(r)).strip()  # join the kept tokens with single spaces
    results.append(rl)
    print(rl)

# Write the tokenised lines to a file.
wakati_file = 'toji.wakati'
with open(wakati_file, 'w', encoding='utf-8') as fp:
    fp.write("\n".join(results))

# Build the Word2Vec model from the tokenised file.
# NOTE(review): gensim 4.0 renamed the `size` keyword to `vector_size`; the
# call below targets the gensim 3.x API this script was written against.
data = word2vec.LineSentence(wakati_file)
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
model.save("toji.model")
print("ok")

# 모델 출력하기

import warnings

warnings.filterwarnings(action='ignore', category=UserWarning)

warnings.filterwarnings(action='ignore', category=FutureWarning)

from gensim.models import word2vec

from pprint import pprint

# Load the model saved as "toji.model" and query it for the word "땅".
model = word2vec.Word2Vec.load("toji.model")

# Similarity queries live on the model's KeyedVectors (`model.wv`).
# `Word2Vec.most_similar` itself is deprecated in gensim 3.x and removed in
# gensim 4.0, while `model.wv.most_similar` works on both.
a = model.wv.most_similar(positive=["땅"])  # words closest to "땅"
b = model.wv.most_similar(negative=["땅"])  # words closest to the negated "땅" vector
pprint(a)
pprint(b)

### 위키피디아 문서를 이용하여 word2vec 사용해보기

## 모델 만들기 전처리

import warnings

warnings.filterwarnings(action='ignore', category=UserWarning)

warnings.filterwarnings(action='ignore', category=FutureWarning)

import codecs

from bs4 import BeautifulSoup

from konlpy.tag import Twitter

from gensim.models import word2vec

# Tokenise "wiki.txt" line by line and write the kept tokens, separated by
# single spaces, to "wiki.wakati".
wakati_file = "wiki.wakati"

# Morphological analyser.
twitter = Twitter()

# Part-of-speech tags that are dropped: particles, endings and punctuation.
excluded_tags = ("Josa", "Eomi", "Punctuation")

# Both files are managed by `with`, so they are closed even if analysis of
# some line raises.
with codecs.open("wiki.txt", "r", encoding="utf-8") as readFp, \
        open(wakati_file, "w", encoding="utf-8") as writeFp:
    # Process the text one line at a time.
    for i, line in enumerate(readFp):
        # Progress report every 20000 lines, including line 0.
        if i % 20000 == 0:
            print("current -" + str(i))
        # Morphological analysis; norm/stem make pos() return normalised base forms.
        malist = twitter.pos(line, norm=True, stem=True)
        for word in malist:
            if word[1] not in excluded_tags:
                # Tokens are written space-separated; no newline is written
                # between input lines.
                writeFp.write(word[0] + " ")

## wiki.wakati를 가지고 모델을 생성하기

import warnings

warnings.filterwarnings(action='ignore', category=UserWarning)

warnings.filterwarnings(action='ignore', category=FutureWarning)

from gensim.models import word2vec

# Train a Word2Vec model (size=100) on the space-separated token stream in
# 'wiki.wakati', read through Text8Corpus, then persist it as "wiki.model".
model = word2vec.Word2Vec(word2vec.Text8Corpus('wiki.wakati'), size=100)
model.save("wiki.model")

# Signal that training and saving finished.
print("ok")

## 모델을 이용하기

import warnings

warnings.filterwarnings(action='ignore', category=UserWarning)

warnings.filterwarnings(action='ignore', category=FutureWarning)

from gensim.models import word2vec

# Load the model saved as "wiki.model" and run several similarity queries.
model = word2vec.Word2Vec.load("wiki.model")

# Similarity queries live on the model's KeyedVectors (`model.wv`).
# `Word2Vec.most_similar` itself is deprecated in gensim 3.x and removed in
# gensim 4.0, while `model.wv.most_similar` works on both.
# `positive` words are added to the query and `negative` words subtracted from it.
print(model.wv.most_similar(positive=["Python", "파이썬"]))
# Only the single best match is printed for this query.
print(model.wv.most_similar(positive=["아빠", "여성"], negative=["남성"])[0])
print(model.wv.most_similar(positive=["왕자", "여성"], negative=['남성']))
print(model.wv.most_similar(positive=["서울", "일본"], negative=['한국']))
print(model.wv.most_similar(positive=["서울", "중국"], negative=["한국"]))
print(model.wv.most_similar(positive=["오른쪽", "남자"], negative=["왼쪽"]))
print(model.wv.most_similar(positive=["서울", "맛집"]))

========================== Python ==========================

'프로그래밍 > Python, R 프로그래밍' 카테고리의 다른 글

[Python] 파이썬을 이용한 여러가지 플롯 그리기 (0)	2018.07.29
[Python] Python 27일차(파이썬을 이용한 머신러닝, 딥러닝 실전 개발 입문) (0)	2018.07.11
[Python] matplotlib 라이브러리에서 한글 폰트가 깨지지 않게 만드는 방법 (0)	2018.07.10
[Python] Python 26일차(파이썬을 이용한 머신러닝, 딥러닝 실전 개발 입문) (0)	2018.07.10
[Python] Python 25일차(파이썬을 이용한 머신러닝, 딥러닝 실전 개발 입문) (0)	2018.07.09

데이터 분석가 블로그

[Python] Python 28일차(파이썬을 이용한 머신러닝, 딥러닝 실전 개발 입문)

'프로그래밍 > Python, R 프로그래밍' 카테고리의 다른 글

+ Recent posts

티스토리툴바