(자연어 처리/ word 기반 RNN) 한국어 시 짓기

char 기반 RNN에서 아쉬웠던 점을 word 기반 RNN에서 만회하려고 word 기반 RNN 코드를 찾아 나섰습니다.

<밑바닥에서 시작하는 딥러닝 2>의 6장~7장에서 소개한 word 기반 RNN도 다시 한번 시도해보려고 합니다.

그전에 다른 코드로 테스트해보고 싶어서 아래의 코드를 발견했습니다.

https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

How to Develop a Word-Level Neural Language Model and Use it to Generate Text

A language model can predict the probability of the next word in the sequence, based on the words already observed […]

machinelearningmastery.com

이 분의 소스 코드는 무척 간결하고 설명이 자세합니다.

별다른 수정없이 적용할 수 있을 정도네요.

물론 전처리 과정은 저랑 다를 수밖에 없습니다. 영어와 한글의 차이도 있고, 문장의 종류 차이도 있으니까요.

제이슨 씨의 train 코드를 저는 이렇게 수정했습니다.

1) 원핫 벡터를 사용하면 메모리 소모가 심합니다. 저는 메모리 16GB를 사용 중인데 메모리 부족으로 프로세스가 죽어버리더군요. batch_size를 한없이 줄이고, 자료형을 최대한 줄여서(가령 int8로) 사용하면 가능하지만, 저는 원핫 벡터를 포기하기로 했습니다.

to_categorical 함수를 사용하는 부분을 주석 처리하고,

#y = to_categorical(y, num_classes=vocab_size, dtype='int8') # (118633, 7410) (228064, 61342 )

모델 컴파일 시, loss 함수값을 다른 것으로 지정해줍니다.

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

2) 매번 train 코드를 실행할 때마다 가중치 학습값을 처음부터 만들 것이 아니라, 학습 재개 시에 기존 가중치값을 불러오는 식으로 코드를 수정했습니다. 그래서 틈날 때마다 조금씩 훈련을 시킬 수 있지요.

keep_training = 0 # 1: resume training, 0: train from scratch
if keep_training: # when resuming, load the previously trained model
  model = load_model('model.h5')

위 코드를 추가하는데, keep_training 값을 0으로 하면 처음 훈련하거나 처음부터 다시 훈련하는 식이고, 1로 설정하면 학습한 모델을 불러와서 훈련시킵니다.

제가 수정한 코드는 다음과 같습니다.

import numpy as np
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

# Load a document into memory.
def load_doc(filename):
  """Return the entire contents of the text file `filename` as one string."""
  # Opened read-only in text mode; the `with` block closes the handle for us.
  with open(filename, 'r') as handle:
    return handle.read()

# load
#in_filename = "republic_sequences.txt"
in_filename = "poems_sequences.txt"

doc = load_doc(in_filename)
#doc = doc[:len(doc) // 3] # shrink the data to one third
lines = doc.split('\n')

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

# tokenizer.word_index
# ' '.join([tokenizer.index_word[w] for w in sequences[0]])

# Vocabulary size used by both the Embedding layer and the output layer.
vocab_size = len(tokenizer.word_index) + 1  # 7410

# separate into input and output
sequences = sequences[:-1] # drop the last entry, which the author treats as a leftover fragment
sequences = np.array(sequences)
# Every token but the last of each row is the input; the last token is the target.
X, y = sequences[:, :-1], sequences[:, -1]
# One-hot targets use far too much memory, so y is kept as integer class ids.
#y = to_categorical(y, num_classes=vocab_size, dtype='int8') # (118633, 7410) (228064, 61342 )
seq_length = X.shape[1]  # 50

keep_training = 1 # 1: resume training, 0: train from scratch
if keep_training:
  # Resuming: load the previously saved model instead of building a fresh one
  # that would be thrown away immediately.
  # NOTE(review): this presumes the corpus is unchanged, so the re-fitted
  # tokenizer yields the same word indices the saved model was trained on.
  model = load_model('model.h5')
else:
  # define model
  model = Sequential()
  model.add(Embedding(vocab_size, 50, input_length=seq_length))
  model.add(LSTM(100, return_sequences=True))
  model.add(LSTM(100))
  model.add(Dense(100, activation='relu'))
  model.add(Dense(vocab_size, activation='softmax'))

  # compile model
  # Targets are integer ids rather than one-hot vectors, hence the sparse loss.
  model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# fit model
model.fit(X, y, batch_size=1024, epochs=40)

# save the model to file
model.save("model.h5")
# save the tokenizer
with open('tokenizer.pkl', 'wb') as f:
  pickle.dump(tokenizer, f)

다음은 전처리 코드입니다.

전처리는 제가 별도로 만들어서 처리했습니다.

시 입력 양식 프로그램에서 출력할 때 약간의 전처리를 하기도 하지만,

그외에도 여러가지 전처리를 했습니다.

import re

#in_filename = "poems.txt"
# out_filename = "poems_clean.txt"
in_filename = "korean_poems_7961.txt"
out_filename = "poems_clean_7961.txt"


with open(in_filename, 'r') as f:
  text = f.read()

# Three or four consecutive newlines separate poems: tag them with a temporary
# ' eop ' (end of poem) marker, then flatten every remaining newline to a space.
text = re.sub("\n\n\n\n", " eop ", text)
text = re.sub("\n\n\n", " eop ", text)
text = re.sub("\n\n", "\n", text)
text = re.sub("\n", " ", text)

text = re.sub(r"\([^\)]+\)", "", text) # drop (...) spans: hanja / English
text = re.sub(r"\[[^\]]+\]", "", text) # drop [...] spans: hanja / English / other

# Remove punctuation. NOTE: `"--` in the class is a range (U+0022-U+002D), so # $ % & + go too.
punctuation = r"""/`『』「」…';:()\[\]<>"--,―~Ⅱ∼·!－.*?"""
text = re.sub("[" + punctuation + "]", "", text)
# Restore the markers as newlines BEFORE Latin letters are stripped so they survive;
# \b stops words that merely contain "eop" (e.g. "people") from being split.
text = re.sub(r"\beop\b", "\n", text)
text = re.sub("[A-Za-z]+", " ", text)  # delete Latin letters from the body
text = re.sub("[0-9]+", "N ", text) # every number becomes the token 'N'

with open(out_filename, 'w') as f:
  f.write(text)

#not_hangul = re.compile(r"[^ㄱ-ㅣ가-힣 \n]+")              #'[^가-힇ㄱ-ㅎㅏ-ㅣ \n]+')

#hanja_regex = "[\u2e80-\u2eff\u31c0-\u31ef\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fbf\uf900-\ufaff]"

#remove_1 = re.sub(r"(")

'''
text2 = not_hangul.sub('', text) # 한글, 공백, 개행문자가 아닌 전부를 ''무로 만듦.
#print(text2)
#result = result.replace("\n", " ")

text2 = re.sub("\n", " <eop> ", text2)
words = text2.strip().split() # <eop>도 하나의 단어로 취급
print(len(words))  # 228061

vocab = list(set(words))
vocab_size = len(vocab) # 61486 # 3만개 이하는 <unk>
print("vocab_size = ", vocab_size)


# 삭제된 (한글 아닌) 단어들을 확인
result = not_hangul.findall(text)
special = list(set(result))  # 리스트
length = len(special)
print(special)
print("한글이 아닌 단어들의 갯수: ", length)

# (한자)[한자] 표현이 아닌 표현들의 개수
not_hanja = []
for w in special:
  if re.search("\(" + hanja_regex + "+\)", w) != None:
    continue
  if re.search("\[" + hanja_regex + "+\]", w) != None:
    continue

  try: 
    tmp = float(w)
    continue
  except:
    not_hanja.append(w)
    

print(not_hanja)
print("(한자), [한자], 숫자를 제외한 단어들의 갯수: ", len(not_hanja))



# 단어를 분리해서 단어 개수를 센다. 희소 단어(1~2개 출현)를 <unk>로 설정한다.
# 넘버를 N으로 설정한다.
# 시의 끝을 <eop>로 설정한다.
# 단어 사전, 인덱스 사전, 단어 빈도 사전을 만든다.

from collections import Counter

vocab = Counter(words)
#print(vocab)
#print(vocab['뜨거운'])  # 90

vocab_size = 31000   # 20750~ 20100 사이에 1~2로 변하는 지점 있음
vocab = vocab.most_common(vocab_size)
#print(vocab)

word_index = {}
index_word = {}

i = 0
for (word, frequency) in vocab:
  i += 1
  word_index[word] = i
  index_word[i] = word
#print(word_index)

vocab = dict(vocab)

# new_words = []
# for w in words:
#   if w not in vocab:
#     print(w, "========================")
#     new_words.append("<unk>")
#   else:
#     new_words.append(w)
    
text = ' '.join(words)
text = text.replace("<eop>", "\n")

with open(out_filename, 'w') as f:
  f.write(text)
  
  '''

앞부분, 그러니까 not_hangul 이 처음 나오기 전까지(결과를 out_filename 파일에 쓰는 부분까지)가 실제로 사용한 전처리 코드이고, 그 이후 코드는 다른 방식으로 전처리를 시도한 흔적입니다.

시의 경우에는 괄호 안에, 대괄호 안에 한자를 쓰는 경향이 있는데

1) 괄호와 함께 한자를 모두 제거하고

2) 숫자의 경우, 자주 사용하는 숫자는 한글로 표기하고, 하나 둘 같은 갯수 세는 것은 열둘까지만 한글 표기, 그외엔 숫자 표기, 방 호실이나 무작위 번호 같은 것은 모두 'N'으로 표기하고,

1시 30분 => 한 시 삼십 분, 20대 => 이십 대, 37호실 => N 호실, B-206호 행성 => N 호 행성

3) 본문 내 영어 문자를 모두 공백(' ')으로 대체하고

4) 모든 구두점 문자를 제거한다. "/`『』「」…';:()\[\]<>\"--,―~Ⅱ∼·!－.*?" 제거

5) 시를 구분하는 문자를 'eop'(end of poem)로 넣고, 경우에 따라 '\n'로 대체한다. 그외 모든 개행문자('\n')를 제거한다. 즉, 시 한 수를 한줄에 나열한다. 단, 1절 2절 같은 절 표시를 가진 시들이 있는데 이 경우에는 절을 하나의 시로 간주해서 한 줄에 나열한다.

6) 괄호 안의 한글 표기는, 괄호를 제거해서 한글을 밖으로 내놓는다. 예) '나는 멍청해.(그렇다고 바보는 아니야)' => '나는 멍청해. 그렇다고 바보는 아니야'

7) 1월, 2월 => 일월, 이월, ....

8) 시를 입력할 때, 한글 표준 문법을 따르지 않는 단어들이 많은 경우, 해당 시를 입력(사용)하지 않는다. (주로 일제시대 이전 시, 문법/형식 파괴 시)

다음은 학습용 텍스트 파일(시퀀스)을 만드는 코드입니다. (전처리 코드 2)

import string
from konlpy.tag import Mecab

#import re

# Load a document into memory.
def load_doc(filename):
  """Return the entire contents of the text file `filename` as one string."""
  # Opened read-only in text mode; the `with` block closes the handle for us.
  with open(filename, 'r') as handle:
    return handle.read()

#in_filename = 'republic_clean.txt'
in_filename = '7961.txt'

# Read the whole cleaned corpus and print the first 200 characters as a preview.
doc = load_doc(in_filename)
print(doc[:200])

# Data preprocessing (the steps of the original English-language tutorial;
# the punctuation, alphabet and lower-casing steps are not applied by clean_doc here):
# 1. Replace '-' with a space, which helps when splitting words on whitespace.
# 2. Split into words on whitespace.
# 3. Remove all punctuation to shrink the vocabulary, e.g. ? , . '
# 4. Drop tokens that are not alphabetic, to get rid of stand-alone punctuation tokens.
# 5. Lower-case everything to shrink the vocabulary.

# Vocabulary size is a very important factor for a language model.
# A small vocabulary gives a small model, and a small model trains faster.

morphs = 1 # 1: split the text into morphemes; any other value: split on whitespace

# Turn a document into a list of tokens.
def clean_doc(doc):
  """Tokenize `doc` and return the list of tokens.

  The module-level flag `morphs` picks the tokenizer: when it equals 1 the
  text is split into morphemes with Mecab, otherwise it is split on
  whitespace.

  The tutorial's extra clean-up (replacing '--', stripping punctuation,
  dropping non-alphabetic tokens, lower-casing) is deliberately not done
  here; the input is expected to have been cleaned beforehand.
  """
  # Whitespace mode: a plain split is all that is needed.
  if morphs != 1:
    return doc.split()

  # Morpheme mode: let Mecab segment the text.
  analyzer = Mecab()
  return analyzer.morphs(doc)

# Tokenize the document and report basic statistics.
tokens = clean_doc(doc)
print(tokens[:200])
print("Total Tokens: %d" % len(tokens))
print("Unique Tokens: %d" % len(set(tokens)))

# --------- results when splitting on whitespace with split()
# Total Tokens: 231602
# Unique Tokens: 61473
# Total sequences: 231551

# --------- results when splitting with mecab.morphs()
# Token count: 404798 (with '\n' left as is, not replaced by eop)
# Unique token count (morpheme vocabulary size): 22757
# Total sequences: 404747


# Build sliding windows of 51 tokens each (50 input tokens + 1 target token).
# Window k covers tokens[k:k + 51]; each window is stored as one
# space-joined line.
# NOTE(review): the range stops at len(tokens) - 1, so the very last window
# (the one ending at the final token) is not emitted.
length = 50 + 1
sequences = [
  ' '.join(tokens[stop - length:stop])
  for stop in range(length, len(tokens))
]

print("Total Sequences: %d" % len(sequences))

# Write the sequences to a file, one sequence per line.
def save_doc(lines, filename):
  """Write `lines` to `filename`, separated by newlines (no trailing newline)."""
  # Join first so the file is only opened once the payload has been built.
  payload = '\n'.join(lines)
  with open(filename, 'w') as sink:
    sink.write(payload)

# Save the sequences to a file (this is the file the training script reads).
# out_filename = 'republic_sequences.txt'
out_filename = 'poems_sequences.txt'
save_doc(sequences, out_filename)

제이슨 씨의 코드에서는 clean_doc()이라는 함수에서 전처리를 하는데, 저는 별도로 전처리를 했으므로 대부분을 주석 처리했습니다.

한글 문장을 띄어쓰기 기준으로 어휘를 나누면 어휘 수가 엄청나게 많아집니다. 가령, '바람이', '바람으로', '바람은', '바람을', '가을바람', '겨울바람', '바다바람' .. 이 모든 게 단독 어휘가 되어 아무런 공통점이 없게 되는데, 형태소 단위로 어휘를 나누어서 시도해볼 필요가 있습니다. 저는 mecab을 사용했습니다.

morphs = 1 # 형태소 단위 분해

morphs 값을 지정해서 형태소 단위로 분리하거나, 띄어쓰기 기준으로 어휘를 나눌 수도 있습니다.

if morphs == 1: # split into morphemes
  mecab = Mecab()
  tokens = mecab.morphs(doc)
else:
  tokens = doc.split() # split on whitespace

저는 한국어 시를 3084개를 입력했는데,
공백으로 단어를 나누었을 때 (split() 사용)

Total Tokens: 231,602
Unique Tokens: 61,473 (어휘 수)
토탈 시퀀스: 231,551 (학습용 시퀀스 수: 전체 토큰 수에서 51을 뺀 개수)

mecab.morphs()를 이용해 형태소 단위로 나누었을 때는,

토큰 갯수: 404,798 ('\n'을 'eop'로 바꾸지 않고 그대로 뒀을 경우)
중복 제거 토큰 갯수(형태소-어휘-갯수): 22,757
토탈 시퀀스: 404,747

비교해보면, 띄어쓰기 기준으로 나누었을 때보다, 형태소 단위로 나누었을 때가 토큰 개수가 1.75배 많습니다.

그런데 중복을 제거한 형태소들 개수는 거의 1/3 로 줄어들었습니다.

머신 러닝에 한결 유리한 상황이 된 것이지요.

아래는 문장을 생성하는 코드입니다. 시퀀스 텍스트 파일에서 임의로 시퀀스 하나(51개의 단어로 구성됨)를 읽어서 그것을 입력해서 그 다음 단어를 예측하는 방식입니다. 별달리 수정을 하지 않았습니다. 서두의 os.environ... 하는 코드는, 텐서 플로에서의 잡다한 경고 메시지 출력을 줄이려고 넣은 것입니다.

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
from random import randint
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load a document into memory.
def load_doc(filename):
  """Return the entire contents of the text file `filename` as one string."""
  # Opened read-only in text mode; the `with` block closes the handle for us.
  with open(filename, 'r') as handle:
    return handle.read()

# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
  """Generate `n_words` words by repeatedly predicting the next word.

  Args:
    model: trained Keras model; `model.predict` is called on one encoded
      sequence at a time.
    tokenizer: fitted Keras Tokenizer providing `texts_to_sequences` and
      the `index_word` mapping.
    seq_length: number of input tokens the model is fed per prediction.
    seed_text: space-separated text used as the initial context.
    n_words: how many words to generate.

  Returns:
    The generated words joined by single spaces (the seed is not included).
    A predicted index with no entry in `index_word` contributes an empty
    string.
  """
  result = []
  in_text = seed_text
  # generate a fixed number of words
  for _ in range(n_words):
    # encode the text as integers
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    # keep only the most recent `seq_length` tokens (older ones are cut off)
    encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
    # Sequential.predict_classes() was removed in TensorFlow 2.6; take the
    # arg-max over the predicted distribution instead.
    probs = model.predict(encoded, verbose=0)
    yhat = int(probs.argmax(axis=-1)[0])
    # map the predicted index back to its word with a direct lookup
    out_word = tokenizer.index_word.get(yhat, '')
    # append to the running context and to the output
    in_text += ' ' + out_word
    result.append(out_word)
  return ' '.join(result)

# load cleaned text sequences
#in_filename = "republic_sequences.txt"
in_filename = "poems_sequences.txt"
doc = load_doc(in_filename)
lines = doc.split('\n')
# Each line holds the input tokens plus one target token, so the model's
# input length is one less than the number of tokens on a line.
seq_length = len(lines[0].split()) - 1  # 50

# load the model
model = load_model('model.h5')

# load the tokenizer
# NOTE: pickle.load executes arbitrary code from the file; only load a
# tokenizer.pkl that you produced yourself.
with open('tokenizer.pkl', 'rb') as f:
  tokenizer = pickle.load(f)

# select a seed text
# randint() includes both end points, so the upper bound must be
# len(lines) - 1; len(lines) would occasionally raise IndexError.
seed_text = lines[randint(0, len(lines) - 1)]
print(seed_text + '\n')

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 100)
print(generated)


# TODO: consider adding code that scores how closely the generated text
# resembles the training text (a plagiarism check).
# For example, add 1 to the score whenever four consecutive words match,
# shifting one word at a time across the text.
# Average the total score over the number of checks and the total word count.
# Alternatively, print the copied passages and how often they occur
# (showing a few words of the original poem before and after each match).

미친토끼의 가출일기

(자연어 처리/ word 기반 RNN) 한국어 시 짓기 - 1

티스토리툴바