2023. 9. 20. 10:11 | ML&DL/NLP
🤗 Hugging Face
- Provides libraries for using a wide range of AI models, such as Transformers
- Representative examples are Transformer-based models such as BERT and GPT (a minimal usage sketch follows below)
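For reference, here is a minimal sketch of how a pretrained model can be used through the transformers library. The pipeline helper and the default English sentiment model it downloads are only illustrative; the rest of this post uses the lower-level tokenizer and model classes instead.
from transformers import pipeline
# Downloads a default pretrained sentiment-analysis model on first use
classifier = pipeline("sentiment-analysis")
print(classifier("This movie was great!"))  # e.g. [{'label': 'POSITIVE', 'score': ...}]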
📌 Goal
- Classify Naver movie reviews by sentiment (positive/negative)!
💻 Practice Code
Install the transformers library provided by Hugging Face
!pip install transformers
Import libraries and packages
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
Download / inspect the train & test data
# Download the train & test data
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')
# Check the data
print('Number of training reviews :', len(train_data))  # print the number of training reviews
print('Number of test reviews :', len(test_data))  # print the number of test reviews
# Preview the data
train_data[:5]  # first 5 rows
test_data[:5]  # first 5 rows
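For reference, the NSMC files are tab-separated with three columns: id, document (the review text), and label (0 = negative, 1 = positive). A quick check, not part of the original code:
print(train_data.columns.tolist())  # ['id', 'document', 'label']
print(train_data['label'].value_counts())  # the two classes are roughly balanced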
Preprocess the train & test data
# Preprocessing - remove missing values
train_data = train_data.dropna(how='any')  # drop rows containing null values
train_data = train_data.reset_index(drop=True)
print(train_data.isnull().values.any())  # check whether any null values remain
test_data = test_data.dropna(how='any')  # drop rows containing null values
test_data = test_data.reset_index(drop=True)
print(test_data.isnull().values.any())  # check whether any null values remain
Tokenize the train & test data
# Load the tokenizer for the BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# Encode the text
print(tokenizer.encode("보는내내 그대로 들어맞는 예측 카리스마 없는 악역"))
# Tokenize the text
print(tokenizer.tokenize("보는내내 그대로 들어맞는 예측 카리스마 없는 악역"))
# Decode the encoded sentence back into the original sentence
tokenizer.decode(tokenizer.encode("보는내내 그대로 들어맞는 예측 카리스마 없는 악역"))
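The decoded string shows the special tokens BERT wraps around every sequence. Their IDs, printed below as an extra sanity check (not part of the original tutorial; IDs shown for bert-base-multilingual-cased), are reused later when building the attention masks:
# [CLS] marks the start of a sequence, [SEP] the end, [PAD] fills unused positions
print(tokenizer.cls_token, tokenizer.cls_token_id)  # [CLS] 101
print(tokenizer.sep_token, tokenizer.sep_token_id)  # [SEP] 102
print(tokenizer.pad_token, tokenizer.pad_token_id)  # [PAD] 0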
Convert the data into the input format the BERT model expects
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):
    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        # input_id: the sentence encoded as integer IDs (for the word embeddings)
        input_id = tokenizer.encode(example, max_length=max_seq_len, padding='max_length', truncation=True)

        # attention_mask: 1 where a real token is located, 0 at padding positions
        padding_count = input_id.count(tokenizer.pad_token_id)
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count

        # token_type_id: segment embedding IDs; all 0 here because each example is a single sentence
        token_type_id = [0] * max_seq_len

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels
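As a quick illustration of what the three inputs look like, here is a toy call with a short max length (an added example, not from the original tutorial; the sample sentence is arbitrary):
(ids, masks, types), ys = convert_examples_to_features(["정말 재밌다"], [1], max_seq_len=8, tokenizer=tokenizer)
print(ids[0])    # starts with the [CLS] id, then the token ids, [SEP], and [PAD] ids if shorter than 8
print(masks[0])  # 1 over real tokens, 0 over padding positions
print(types[0])  # all 0: each example is a single sentence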
Build the train / test sets from the converted data
max_seq_len = 128  # maximum sequence length for the BERT inputs

train_X, train_y = convert_examples_to_features(train_data['document'], train_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)
test_X, test_y = convert_examples_to_features(test_data['document'], test_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)
Inspect the converted data
# maximum length: 128
input_id = train_X[0][0]
attention_mask = train_X[1][0]
token_type_id = train_X[2][0]
label = train_y[0]
print('Integer-encoded tokens :', input_id)
print('Attention mask :', attention_mask)
print('Segment encoding :', token_type_id)
print('Length of each encoding :', len(input_id))
print('Decoded from the integer encoding :', tokenizer.decode(input_id))
print('Label :', label)
Load the pre-trained BERT model
model = TFBertModel.from_pretrained("bert-base-multilingual-cased")
Define the model inputs before training
max_seq_len = 128
input_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
attention_masks_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
token_type_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
outputs = model([input_ids_layer, attention_masks_layer, token_type_ids_layer])
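The BERT body returns two main tensors: the per-token hidden states and the pooled [CLS] representation. A quick way to see this (shapes assume max_seq_len=128 and the 768-dimensional base model; depending on the transformers version, outputs may be a tuple or a ModelOutput, but integer indexing works for both):
print(outputs[0].shape)  # last hidden states: (None, 128, 768), one vector per token
print(outputs[1].shape)  # pooled output: (None, 768), the [CLS] representation used for classification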
Define the model
class TFBertForSequenceClassification(tf.keras.Model):
    def __init__(self, model_name):
        super(TFBertForSequenceClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(1,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                activation='sigmoid',
                                                name='classifier')

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # outputs[1] is the pooled [CLS] representation; feed it to the sigmoid classifier
        cls_token = outputs[1]
        prediction = self.classifier(cls_token)
        return prediction
# Use the default distribution strategy (replace with a TPUStrategy / MirroredStrategy if available)
strategy = tf.distribute.get_strategy()

with strategy.scope():
    model = TFBertForSequenceClassification("bert-base-multilingual-cased")
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    loss = tf.keras.losses.BinaryCrossentropy()
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
Model training / test results
model.fit(train_X, train_y, epochs=2, batch_size=64, validation_split=0.2)
results = model.evaluate(test_X, test_y, batch_size=1024)
print("test loss, test acc: ", results)
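To look at individual predictions rather than the aggregate metrics, the sigmoid scores can be thresholded at 0.5. A small sketch, added here and not part of the original tutorial:
# Predict on the first 5 test samples and compare against the true labels
sample_scores = model.predict([test_X[0][:5], test_X[1][:5], test_X[2][:5]])
sample_preds = (sample_scores > 0.5).astype(int).reshape(-1)
print('Predicted:', sample_preds)
print('Actual   :', test_y[:5])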
Try out the classifier
def sentiment_predict(new_sentence):
    input_id = tokenizer.encode(new_sentence, max_length=max_seq_len, padding='max_length', truncation=True)
    padding_count = input_id.count(tokenizer.pad_token_id)
    attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
    token_type_id = [0] * max_seq_len

    input_ids = np.array([input_id])
    attention_masks = np.array([attention_mask])
    token_type_ids = np.array([token_type_id])

    encoded_input = [input_ids, attention_masks, token_type_ids]
    score = model.predict(encoded_input)[0][0]
    print(score)

    if score > 0.5:
        print("Positive review with {:.2f}% probability.\n".format(score * 100))
    else:
        print("Negative review with {:.2f}% probability.\n".format((1 - score) * 100))
Results
Negative review
sentiment_predict("보던거라 계속보고있는데 전개도 느리고 주인공인 은희는 한두컷 나오면서 소극적인모습에")

Positive review
sentiment_predict('와 개쩐다 정말 세계관 최강자들의 영화다')

Source: https://github.com/ukairia777/tensorflow-nlp-tutorial