[NLP] Classifying Movie Reviews with BERT


๐Ÿค— Hugging Face

- Provides libraries that make it easy to use a wide range of AI models, such as those based on the Transformer architecture

- ๋Œ€ํ‘œ์ ์ธ ๋ชจ๋ธ๋กœ Transformer ๊ธฐ๋ฐ˜์ธ BERT, GPT ๋ชจ๋ธ ๋“ฑ์ด ์žˆ์Œ

 

๐Ÿ“Œ Goal

- ๋„ค์ด๋ฒ„ ์˜ํ™” ๋ฆฌ๋ทฐ๋ฅผ ๊ฐ์„ฑ(๊ธ์ •/๋ถ€์ •) ๋ถ„๋ฅ˜ํ•ด๋ณด๊ธฐ! 

 

๐Ÿ’ป Hands-on Code

 

Install the transformers library provided by Hugging Face

!pip install transformers

 

๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ๋ฐ ํŒจํ‚ค์ง€ import 

import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

 

train & test ๋ฐ์ดํ„ฐ ๋‹ค์šด / ํ™•์ธ

# train & test ๋ฐ์ดํ„ฐ ์„ค์น˜
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')
# ๋ฐ์ดํ„ฐ ํ™•์ธ
print('ํ›ˆ๋ จ์šฉ ๋ฆฌ๋ทฐ ๊ฐœ์ˆ˜ :',len(train_data)) # ํ›ˆ๋ จ์šฉ ๋ฆฌ๋ทฐ ๊ฐœ์ˆ˜ ์ถœ๋ ฅ
print('ํ…Œ์ŠคํŠธ์šฉ ๋ฆฌ๋ทฐ ๊ฐœ์ˆ˜ :',len(test_data)) # ํ…Œ์ŠคํŠธ์šฉ ๋ฆฌ๋ทฐ ๊ฐœ์ˆ˜ ์ถœ๋ ฅ
# ๋ฐ์ดํ„ฐ ํ™•์ธ 
train_data[:5] # ์ƒ์œ„ 5๊ฐœ ์ถœ๋ ฅ
test_data[:5] # ์ƒ์œ„ 5๊ฐœ ์ถœ๋ ฅ
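Before preprocessing, it is also worth checking how balanced the labels are and how many values are missing; the snippet below is a small sketch (not in the original post) using plain pandas.

# Sketch: check class balance and missing values (illustrative addition)
print(train_data['label'].value_counts())  # counts of negative(0) / positive(1) reviews
print(train_data.isnull().sum())           # missing values per column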

Preprocess the train & test data

# ๋ฐ์ดํ„ฐ ์ „์ฒ˜๋ฆฌ - ๊ฒฐ์ธก์น˜ ์ œ๊ฑฐ

train_data = train_data.dropna(how = 'any') # drop rows that contain Null values
train_data = train_data.reset_index(drop=True)
print(train_data.isnull().values.any()) # check whether any Null values remain

test_data = test_data.dropna(how = 'any') # drop rows that contain Null values
test_data = test_data.reset_index(drop=True)
print(test_data.isnull().values.any()) # check whether any Null values remain
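The NSMC files also contain some duplicate reviews; as an optional extra step (not in the original code), they could be dropped with pandas drop_duplicates, for example:

# Optional sketch: drop duplicate reviews as well (assumes duplicates add no information)
train_data = train_data.drop_duplicates(subset=['document']).reset_index(drop=True)
test_data = test_data.drop_duplicates(subset=['document']).reset_index(drop=True)
print(len(train_data), len(test_data))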

train & test ํ† ํฐํ™”

# BERT ๋ชจ๋ธ์˜ ํ† ํฌ๋‚˜์ด์ € ๊ฐ€์ ธ์˜ค๊ธฐ
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
#ํ…์ŠคํŠธ ์ธ์ฝ”๋”ฉ
print(tokenizer.encode("๋ณด๋Š”๋‚ด๋‚ด ๊ทธ๋Œ€๋กœ ๋“ค์–ด๋งž๋Š” ์˜ˆ์ธก ์นด๋ฆฌ์Šค๋งˆ ์—†๋Š” ์•…์—ญ"))

#ํ…์ŠคํŠธ ํ† ํฐํ™”
print(tokenizer.tokenize("๋ณด๋Š”๋‚ด๋‚ด ๊ทธ๋Œ€๋กœ ๋“ค์–ด๋งž๋Š” ์˜ˆ์ธก ์นด๋ฆฌ์Šค๋งˆ ์—†๋Š” ์•…์—ญ"))

# ์ธ์ฝ”๋”ฉ๋œ ๋ฌธ์žฅ์„ ์›๋ž˜ ๋ฌธ์žฅ์œผ๋กœ ๋””์ฝ”๋”ฉ
tokenizer.decode(tokenizer.encode("๋ณด๋Š”๋‚ด๋‚ด ๊ทธ๋Œ€๋กœ ๋“ค์–ด๋งž๋Š” ์˜ˆ์ธก ์นด๋ฆฌ์Šค๋งˆ ์—†๋Š” ์•…์—ญ"))
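Note that encode() silently adds BERT's special tokens; the short check below is a sketch (not from the original post) that makes the [CLS], [SEP], and [PAD] tokens visible.

# Sketch: inspect the special tokens that encode() adds automatically
sentence = "๋ณด๋Š”๋‚ด๋‚ด ๊ทธ๋Œ€๋กœ ๋“ค์–ด๋งž๋Š” ์˜ˆ์ธก ์นด๋ฆฌ์Šค๋งˆ ์—†๋Š” ์•…์—ญ"
print(tokenizer.convert_ids_to_tokens(tokenizer.encode(sentence)))  # ['[CLS]', ..., '[SEP]']
print(tokenizer.cls_token, tokenizer.cls_token_id)  # [CLS] 101
print(tokenizer.sep_token, tokenizer.sep_token_id)  # [SEP] 102
print(tokenizer.pad_token, tokenizer.pad_token_id)  # [PAD] 0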

Bert ๋ชจ๋ธ์— ์•Œ๋งž์€ ์ž…๋ ฅ๊ฐ’์œผ๋กœ ๋ฐ์ดํ„ฐ ๋ณ€ํ™˜ํ•ด์ฃผ๊ธฐ

def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):

    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        # input_id is the integer-encoded sentence used for the word embeddings
        input_id = tokenizer.encode(example, max_length=max_seq_len, padding='max_length', truncation=True)

        # attention_mask is a sequence with 1 at real-token positions and 0 at padding positions
        padding_count = input_id.count(tokenizer.pad_token_id)
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count

        # token_type_id is for the segment embeddings; this example uses a single sentence, so it is all 0
        token_type_id = [0] * max_seq_len

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)

    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels

Build the train / test sets from the converted data

max_seq_len = 128  # maximum sequence length

train_X, train_y = convert_examples_to_features(train_data['document'], train_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)
test_X, test_y = convert_examples_to_features(test_data['document'], test_data['label'], max_seq_len=max_seq_len, tokenizer=tokenizer)

๋ฐ์ดํ„ฐ ํ™•์ธ

# ์ตœ๋Œ€ ๊ธธ์ด: 128
input_id = train_X[0][0]
attention_mask = train_X[1][0]
token_type_id = train_X[2][0]
label = train_y[0]

print('Integer encoding of the tokens :', input_id)
print('Attention mask :', attention_mask)
print('Segment encoding :', token_type_id)
print('Length of each encoding :', len(input_id))
print('Decoded integer encoding :', tokenizer.decode(input_id))
print('Label :', label)

Load the pre-trained BERT model

model = TFBertModel.from_pretrained("bert-base-multilingual-cased")

๋ชจ๋ธ ํ•™์Šต ์ „์— input ๊ฐ’ ์ •์˜ํ•ด์ฃผ๊ธฐ

max_seq_len = 128
input_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
attention_masks_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)
token_type_ids_layer = tf.keras.layers.Input(shape=(max_seq_len,), dtype=tf.int32)

outputs = model([input_ids_layer, attention_masks_layer, token_type_ids_layer])
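To see what the pretrained model actually returns (and why outputs[1] is used as the [CLS] representation in the classifier below), a quick shape check like the following can help; it is a sketch, not part of the original code.

# Sketch: run the pretrained model on a small batch and inspect its two main outputs
sample_out = model([train_X[0][:2], train_X[1][:2], train_X[2][:2]])
print(sample_out[0].shape)  # last_hidden_state: (2, max_seq_len, 768)
print(sample_out[1].shape)  # pooler_output for the [CLS] token: (2, 768)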

๋ชจ๋ธ ์ •์˜

class TFBertForSequenceClassification(tf.keras.Model):
    def __init__(self, model_name):
        super(TFBertForSequenceClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = tf.keras.layers.Dense(1,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
                                                activation='sigmoid',
                                                name='classifier')

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls_token = outputs[1]  # pooler output for the [CLS] token
        prediction = self.classifier(cls_token)

        return prediction

# Build and compile the model inside a distribution strategy scope
# (on a Colab TPU this would be a TPUStrategy; the default strategy works on CPU/GPU)
strategy = tf.distribute.get_strategy()

with strategy.scope():
  model = TFBertForSequenceClassification("bert-base-multilingual-cased")
  optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
  loss = tf.keras.losses.BinaryCrossentropy()
  model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
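As a side note, the transformers library also ships its own sequence-classification head; the sketch below shows how the same backbone could be loaded with it instead of the custom class above (an alternative, not what this post uses; the import is aliased to avoid clashing with the class name defined here).

# Alternative sketch: use the library's built-in classification head
from transformers import TFBertForSequenceClassification as HFSequenceClassifier

hf_model = HFSequenceClassifier.from_pretrained("bert-base-multilingual-cased", num_labels=2)
# This variant outputs 2-class logits, so it would be compiled with
# tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) instead of sigmoid + binary cross-entropy.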

๋ชจ๋ธ ํ•™์Šต / ํ…Œ์ŠคํŠธ ๊ฒฐ๊ณผ

model.fit(train_X, train_y, epochs=2, batch_size=64, validation_split=0.2)
results = model.evaluate(test_X, test_y, batch_size=1024)
print("test loss, test acc: ", results)

 

Try out the classifier

def sentiment_predict(new_sentence):
  # Encode the sentence the same way as the training data
  input_id = tokenizer.encode(new_sentence, max_length=max_seq_len, padding='max_length', truncation=True)

  padding_count = input_id.count(tokenizer.pad_token_id)
  attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
  token_type_id = [0] * max_seq_len

  input_ids = np.array([input_id])
  attention_masks = np.array([attention_mask])
  token_type_ids = np.array([token_type_id])

  encoded_input = [input_ids, attention_masks, token_type_ids]
  score = model.predict(encoded_input)[0][0]
  print(score)

  if(score > 0.5):
    print("Positive review with {:.2f}% probability.\n".format(score * 100))
  else:
    print("Negative review with {:.2f}% probability.\n".format((1 - score) * 100))

Results

Negative review

sentiment_predict("๋ณด๋˜๊ฑฐ๋ผ ๊ณ„์†๋ณด๊ณ ์žˆ๋Š”๋ฐ ์ „๊ฐœ๋„ ๋Š๋ฆฌ๊ณ  ์ฃผ์ธ๊ณต์ธ ์€ํฌ๋Š” ํ•œ๋‘์ปท ๋‚˜์˜ค๋ฉด์„œ ์†Œ๊ทน์ ์ธ๋ชจ์Šต์— ")

 

๊ธ์ • ๋ฆฌ๋ทฐ

sentiment_predict('์™€ ๊ฐœ์ฉ๋‹ค ์ •๋ง ์„ธ๊ณ„๊ด€ ์ตœ๊ฐ•์ž๋“ค์˜ ์˜ํ™”๋‹ค')

 

Source: https://github.com/ukairia777/tensorflow-nlp-tutorial

 
