ebrukilic/tubitak_clothing_absa_v3
Viewer • Updated • 12.4k • 48 • 1
How to use ebrukilic/bert-absa-tr-v4 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-classification", model="ebrukilic/bert-absa-tr-v4")
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("ebrukilic/bert-absa-tr-v4")
model = AutoModelForSequenceClassification.from_pretrained("ebrukilic/bert-absa-tr-v4")
Bu model, Türkçe ürün yorumları üzerinde Aspect-Based Sentiment Analysis (ABSA) yapmak için fine-tune edilmiştir.
Veri seti: ebrukilic/tubitak_clothing_absa_v3
Etiketler: ["negatif", "nötr", "pozitif"]
Bu model, ürün yorumlarını otomatik olarak aspect (beden, kalite, fiyat, renk, kargo, kumaş) ve duygu (pozitif, negatif, nötr) olarak sınıflandırır.
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pandas as pd
from collections import defaultdict
# ------------------- Constants -------------------
# Hub id of the fine-tuned checkpoint; used for both the tokenizer and the model.
MODEL_ID = "ebrukilic/bert-absa-tr-v4"
# Hub id of the dataset that is evaluated.
DATASET_ID = "ebrukilic/tubitak_clothing_absa_v3"
# Dataset split passed to load_dataset.
SPLIT = "test"
# Column holding the review text that is fed to the tokenizer.
TEXT_COL = "normalized_yorum"
# Column holding the gold sentiment label.
LABEL_COL = "polarity"
# Column holding the aspect annotation(s) of each review.
ASPECT_COL = "aspects"
# Batch size used by the evaluation DataLoader.
BATCH_SIZE = 16
# Maximum tokenized length; inputs are truncated/padded to this size.
MAX_LEN = 128
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
# ---------- Dataset loading ----------------
# Download the requested split and convert it to a pandas DataFrame.
ds_raw = load_dataset(DATASET_ID, split=SPLIT)
df = ds_raw.to_pandas()
# Normalize the raw aspect annotations into plain Python lists
def to_list(x):
    """Coerce a raw aspect cell into a plain Python list.

    Args:
        x: The raw cell value. May be ``None``, a list, a tuple, an
            array-like object exposing ``tolist()``, or a single scalar.

    Returns:
        ``[]`` for ``None``; the list itself for a list; a list of the
        elements for a tuple or array-like value; otherwise a one-element
        list wrapping the scalar.
    """
    if x is None:
        return []
    if isinstance(x, list):
        return x
    if isinstance(x, tuple):
        return list(x)
    # List-valued columns typically come back from ``to_pandas()`` as array
    # objects rather than Python lists. Without this branch the whole array
    # would be wrapped as a single element and later stringified as one
    # bogus aspect name.
    tolist = getattr(x, "tolist", None)
    if callable(tolist):
        values = tolist()
        # 0-d arrays / array scalars yield a bare scalar from ``tolist()``.
        return values if isinstance(values, list) else [values]
    return [x]
def clean_list(lst):
    """Stringify the entries of *lst*, dropping placeholder aspects.

    An entry is dropped when its lower-cased string form is ``"unknown"``,
    ``"unk"`` or the empty string. Surviving entries are returned as ``str``
    in their original order.
    """
    placeholders = {"unknown", "unk", ""}
    kept = []
    for entry in lst:
        text = str(entry)
        if text.lower() in placeholders:
            continue
        kept.append(text)
    return kept
# Attach the cleaned aspect list to every review and keep only reviews that
# still have at least one aspect afterwards.
df["_aspect_list"] = df[ASPECT_COL].apply(lambda cell: clean_list(to_list(cell)))
df = df.loc[df["_aspect_list"].map(len).gt(0)].copy()
# Label encoding: ids follow the sorted order of the distinct label strings.
label_space = sorted({str(value) for value in df[LABEL_COL]})
label2id = dict(zip(label_space, range(len(label_space))))
id2label = dict(enumerate(label_space))
print("Label mapping:", label2id, "\n")
# ------------------- Load model -------------------
# The label mapping built from the data is handed to the model config so that
# predicted ids can be translated back to label strings.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=len(label_space),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)
# Inference only: switch the module to evaluation mode.
model.eval()
# Aspect-aware dataset
class AspectAwareDataset(Dataset):
    """Dataset yielding one tokenized (aspect, text) pair per review aspect.

    The input frame is exploded on ``_aspect_list`` so that a review with
    several aspects contributes one sample per aspect; the exploded column
    is exposed as ``aspect``.
    """

    def __init__(self, dataframe, tokenizer, label2id, max_length=128):
        exploded = dataframe.explode("_aspect_list")
        exploded = exploded.rename(columns={"_aspect_list": "aspect"})
        self.data = exploded.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        # Both the aspect and the review text are given to the tokenizer,
        # as a sentence pair with the aspect first.
        aspect = record["aspect"]
        text = record[TEXT_COL]
        label = self.label2id[str(record[LABEL_COL])]
        encoded = self.tokenizer(
            aspect,
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch dimension of size 1;
        # drop it so the DataLoader can stack the samples itself.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(label)
        return sample
# ------------------- DataLoader -------------------
# Samples are served in dataset order (no shuffling).
dataloader_aspect = DataLoader(
    dataset=AspectAwareDataset(
        dataframe=df,
        tokenizer=tokenizer,
        label2id=label2id,
        max_length=MAX_LEN,
    ),
    batch_size=BATCH_SIZE,
    shuffle=False,
)
# ------------------- Evaluation function -------------------
def evaluate_model(dataloader, device, df):
    """Run the module-level ``model`` over *dataloader* and collect results.

    Args:
        dataloader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors. It is expected to
            serve, without shuffling, one sample per (review, aspect) pair
            of *df* in exploded order.
        device: Torch device the batch tensors are moved to.
        df: DataFrame with an ``_aspect_list`` column; exploding that column
            gives the aspect of each evaluated sample.

    Returns:
        A tuple ``(y_true, y_pred, aspect_perf)`` where ``y_true`` and
        ``y_pred`` are lists of label ids and ``aspect_perf`` maps each
        aspect to a list of ``(true_id, pred_id)`` pairs.
    """
    y_true, y_pred = [], []
    aspect_perf = defaultdict(list)
    # The dataloader yields one sample per exploded (review, aspect) row, so
    # sample positions must be looked up in the exploded aspect sequence.
    # Indexing the un-exploded frame by sample position would attribute
    # predictions to the wrong review as soon as any review has more than
    # one aspect.
    sample_aspects = df.explode("_aspect_list")["_aspect_list"].tolist()
    # Running position of the first sample of the current batch; tracked
    # from the real batch lengths instead of assuming a fixed batch size.
    offset = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            true_ids = labels.cpu().tolist()
            pred_ids = predictions.cpu().tolist()
            y_true.extend(true_ids)
            y_pred.extend(pred_ids)
            batch_aspects = sample_aspects[offset:offset + len(true_ids)]
            # zip stops early if the dataloader serves more samples than the
            # frame describes, so no out-of-range lookup can occur.
            for aspect, true_id, pred_id in zip(batch_aspects, true_ids, pred_ids):
                aspect_perf[aspect].append((true_id, pred_id))
            offset += len(true_ids)
    return y_true, y_pred, aspect_perf
# ------------------- Evaluate the model -------------------
print("=== Aspect-aware Evaluation ===")
y_true, y_pred, aspect_perf = evaluate_model(
    dataloader=dataloader_aspect,
    device=device,
    df=df,
)
# Headline metrics first, then the per-class breakdown.
print(
    f"Accuracy: {accuracy_score(y_true, y_pred):.4f} "
    f"Macro-F1: {f1_score(y_true, y_pred, average='macro'):.4f}"
)
print(classification_report(y_true, y_pred, target_names=label_space))
| Label | Precision | Recall | F1-Score | Support |
|---|---|---|---|---|
| negatif | 0.72 | 0.90 | 0.80 | 1719 |
| nötr | 0.36 | 0.25 | 0.29 | 653 |
| pozitif | 0.86 | 0.74 | 0.79 | 1639 |
| macro avg | 0.64 | 0.63 | 0.63 | 4011 |
| weighted avg | 0.72 | 0.73 | 0.71 | 4011 |
| Label | Precision | Recall | F1-Score | Support |
|---|---|---|---|---|
| negatif | 0.64 | 0.75 | 0.69 | 1719 |
| nötr | 0.19 | 0.25 | 0.21 | 653 |
| pozitif | 0.79 | 0.55 | 0.65 | 1639 |
| macro avg | 0.54 | 0.52 | 0.52 | 4011 |
| weighted avg | 0.63 | 0.59 | 0.60 | 4011 |
Base model
dbmdz/bert-base-turkish-cased