[NLP] Classifying Movie Reviews with BERT
🤗 Hugging Face
- Provides libraries that make it easy to use a wide range of AI models, such as the Transformer
- Representative models include Transformer-based models such as BERT and GPT (see the short sketch below)
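As a quick taste of the library, models can be loaded by name straight from the Hugging Face hub. Below is a minimal sketch using the high-level pipeline API; it is illustration only and downloads a default English sentiment model, not the Korean classifier built in this post.
from transformers import pipeline

classifier = pipeline("sentiment-analysis")  # pulls a default pre-trained sentiment model
print(classifier("What a great movie!"))     # e.g. [{'label': 'POSITIVE', 'score': ...}]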
Goal
- Classify Naver movie reviews by sentiment (positive / negative)!
💻 Practice Code
Install the transformers library provided by Hugging Face
!pip install transformers
Import libraries and packages
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
Download and inspect the train & test data
# download the train & test data
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')
# check the data size
print('Number of training reviews :', len(train_data))  # print the number of training reviews
print('Number of test reviews :', len(test_data))  # print the number of test reviews
# preview the data
train_data[:5]  # show the first 5 rows
test_data[:5]  # show the first 5 rows
Preprocess the train & test data
# data preprocessing - remove missing values
train_data = train_data.dropna(how='any')  # drop rows that contain null values
train_data = train_data.reset_index(drop=True)
print(train_data.isnull().values.any())  # check whether any null values remain
test_data = test_data.dropna(how='any')  # drop rows that contain null values
test_data = test_data.reset_index(drop=True)
print(test_data.isnull().values.any())  # check whether any null values remain
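The NSMC data also contains some duplicate reviews. Dropping them is optional and not part of the code above; a small sketch if you want to do it as well:
# optional: drop duplicate reviews, keeping the first occurrence of each
train_data = train_data.drop_duplicates(subset=['document']).reset_index(drop=True)
test_data = test_data.drop_duplicates(subset=['document']).reset_index(drop=True)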
Tokenize the train & test data
# load the BERT model's tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# encode a sample review ("predictions that come true the whole time, a villain with no charisma")
print(tokenizer.encode("보는내내 그대로 들어맞는 예측 카리스마 없는 악역"))
# tokenize the sample review
print(tokenizer.tokenize("보는내내 그대로 들어맞는 예측 카리스마 없는 악역"))
# decode the encoded sentence back into the original sentence
tokenizer.decode(tokenizer.encode("보는내내 그대로 들어맞는 예측 카리스마 없는 악역"))
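Note that encode() wraps every sentence in BERT's special tokens, which is why the decoded string comes back with [CLS] at the front and [SEP] at the end. A quick way to inspect those tokens, using only the tokenizer loaded above:
# special tokens added around each sentence, plus the padding token used further below
print(tokenizer.cls_token, tokenizer.cls_token_id)
print(tokenizer.sep_token, tokenizer.sep_token_id)
print(tokenizer.pad_token, tokenizer.pad_token_id)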
Convert the data into the input format the BERT model expects
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):
    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []
    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        # input_id is the integer encoding of the sentence, used for the word embeddings
        # (pad_to_max_length is deprecated in newer transformers releases; padding='max_length' with truncation=True is the newer equivalent)
        input_id = tokenizer.encode(example, max_length=max_seq_len, pad_to_max_length=True)
        # attention_mask is 1 wherever a real token sits and 0 at padding positions
        padding_count = input_id.count(tokenizer.pad_token_id)
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
        # token_type_id is for the segment embeddings; this example uses single sentences, so it is all zeros
        # (see the pair-encoding sketch after this function)
        token_type_id = [0] * max_seq_len
        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)
    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    data_labels = np.asarray(data_labels, dtype=np.int32)
    return (input_ids, attention_masks, token_type_ids), data_labels
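For reference (not needed for NSMC, where every example is a single sentence): when a sentence pair is passed in, the tokenizer produces the segment ids by itself. A minimal sketch with made-up sentences:
pair = tokenizer("첫 번째 문장", "두 번째 문장")  # "first sentence", "second sentence"
print(pair['token_type_ids'])  # 0s over the first sentence, 1s over the second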
Build the train / test sets from the converted data
max_seq_len = 128  # maximum sequence length, needed by the conversion below
train_X, train_y = convert_examples_to_features(train_data['document'], train_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)
test_X, test_y = convert_examples_to_features(test_data['document'], test_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)
Inspect the converted data
# maximum length: 128
input_id = train_X[0][0]
attention_mask = train_X[1][0]
token_type_id = train_X[2][0]
label = train_y[0]
print('Integer encoding of the tokens :', input_id)
print('Attention mask :', attention_mask)
print('Segment encoding :', token_type_id)
print('Length of each encoding :', len(input_id))
print('Sentence restored from the integer encoding :', tokenizer.decode(input_id))
print('Label :', label)
Load the pre-trained BERT model
model = TFBertModel.from_pretrained("bert-base-multilingual-cased")
Define the model inputs before training
max_seq_len = 128
input_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
attention_masks_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
token_type_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
outputs = model([input_ids_layer, attention_masks_layer, token_type_ids_layer])
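For reference, TFBertModel returns the per-token hidden states plus a pooled output for the [CLS] position; a quick shape check (assuming bert-base with hidden size 768 and max_seq_len = 128):
print(outputs[0].shape)  # token-level representations, expected (None, 128, 768)
print(outputs[1].shape)  # pooled [CLS] representation, expected (None, 768)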
Define the model
class TFBertForSequenceClassification(tf.keras.Model):
    def __init__(self, model_name):
        super(TFBertForSequenceClassification, self).__init__()
        # from_pt=True loads the PyTorch checkpoint (requires torch to be installed)
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(1,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                activation='sigmoid',
                                                name='classifier')

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls_token = outputs[1]  # pooled output for the [CLS] position
        prediction = self.classifier(cls_token)
        return prediction
# `strategy` is assumed to be a tf.distribute.Strategy defined earlier (e.g. a TPU strategy in Colab);
# if none has been set up, the default strategy below works on CPU/GPU as well.
strategy = tf.distribute.get_strategy()
with strategy.scope():
    model = TFBertForSequenceClassification("bert-base-multilingual-cased")
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    loss = tf.keras.losses.BinaryCrossentropy()
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
Model training / test results
model.fit(train_X, train_y, epochs=2, batch_size=64, validation_split=0.2)
results = model.evaluate(test_X, test_y, batch_size=1024)
print("test loss, test acc: ", results)
Test the classification results
def sentiment_predict(new_sentence):
    input_id = tokenizer.encode(new_sentence, max_length=max_seq_len, pad_to_max_length=True)
    padding_count = input_id.count(tokenizer.pad_token_id)
    attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
    token_type_id = [0] * max_seq_len
    input_ids = np.array([input_id])
    attention_masks = np.array([attention_mask])
    token_type_ids = np.array([token_type_id])
    encoded_input = [input_ids, attention_masks, token_type_ids]
    score = model.predict(encoded_input)[0][0]
    print(score)
    if score > 0.5:
        print("Positive review with {:.2f}% probability.\n".format(score * 100))
    else:
        print("Negative review with {:.2f}% probability.\n".format((1 - score) * 100))
Results
Negative review
sentiment_predict("보던거라 계속보고 있는데 전개도 느리고 주인공인 은희는 한두컷 나오면서 자극적인 모습")  # "I kept watching because I'd started it, but the plot is slow and the lead Eun-hee only shows up for a couple of sensational cuts"
Positive review
sentiment_predict('와 개쩐다 정말 세계관 최강자들의 영화다')  # "Wow, this is amazing. Truly a movie of the strongest in the universe"
Source: https://github.com/ukairia777/tensorflow-nlp-tutorial