import pandas as pd
import numpy as np
import re
import warnings

warnings.filterwarnings('ignore')

# Import the NLP libraries
try:
    from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig
    from transformers import T5Tokenizer, T5ForConditionalGeneration
    from sentence_transformers import SentenceTransformer
    from keybert import KeyBERT
    import torch
except ImportError as e:
    print(f"Error importing NLP libraries: {e}")


class MediaAnalyzer:
    def __init__(self):
        self.tokenizer = None
        self.bert_model = None
        self.embedding_model = None
        self.emotion_tokenizer = None
        self.emotion_model = None
        self.emotion_config = None
        self.summarization_tokenizer = None
        self.summarization_model = None
        self.ner_analyzer = None
        self.keyword_extractor = None
        self.models_loaded = False
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Emotion labels for the Indonesian model
        self.emotion_labels = [
            'senang', 'sedih', 'marah', 'takut', 'jijik', 'terkejut'
        ]

    def load_models(self):
        """Load all required NLP models."""
        try:
            print("Loading NLP models...")
            print(f"Using device: {self.device}")

            # Indonesian BERT model for tokenization and embeddings
            model_name = 'cahya/bert-base-indonesian-1.5G'
            print(f"Loading BERT model: {model_name}")
            self.tokenizer = BertTokenizer.from_pretrained(model_name)
            self.bert_model = BertModel.from_pretrained(model_name)
            self.bert_model.to(self.device)
            self.bert_model.eval()  # Set to evaluation mode

            # Indonesian emotion classification model
            try:
                print("Loading Indonesian emotion classification model...")
                self.emotion_tokenizer = BertTokenizer.from_pretrained("thoriqfy/indobert-emotion-classification")
                self.emotion_config = BertConfig.from_pretrained("thoriqfy/indobert-emotion-classification")
                self.emotion_model = BertForSequenceClassification.from_pretrained(
                    "thoriqfy/indobert-emotion-classification",
                    config=self.emotion_config
                )
                self.emotion_model.to(self.device)
                self.emotion_model.eval()
                print("✅ Indonesian emotion classification model loaded successfully")
            except Exception as e:
                print(f"❌ Indonesian emotion classification model failed to load: {e}")
                self.emotion_tokenizer = None
                self.emotion_model = None

            # Indonesian T5 summarization model
            try:
                print("Loading Indonesian T5 summarization model...")
                self.summarization_tokenizer = T5Tokenizer.from_pretrained("cahya/t5-base-indonesian-summarization-cased")
                self.summarization_model = T5ForConditionalGeneration.from_pretrained("cahya/t5-base-indonesian-summarization-cased")
                self.summarization_model.to(self.device)
                self.summarization_model.eval()
                print("✅ Indonesian T5 summarization model loaded successfully")
            except Exception as e:
                print(f"❌ Indonesian T5 summarization model failed to load: {e}")
                self.summarization_tokenizer = None
                self.summarization_model = None

            # Alternative embedding model (used if the primary BERT path fails)
            try:
                self.embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
                if torch.cuda.is_available():
                    self.embedding_model = self.embedding_model.to(self.device)
            except Exception as e:
                print(f"SentenceTransformer failed: {e}")
                self.embedding_model = None

            # NER with an Indonesian model
            try:
                from transformers import pipeline
                self.ner_analyzer = pipeline(
                    "ner",
                    model="indolem/indobert-base-uncased-ner",
                    tokenizer="indolem/indobert-base-uncased-ner",
                    aggregation_strategy="simple",
                    device=0 if torch.cuda.is_available() else -1
                )
            except Exception as e:
                print(f"NER analyzer failed to load: {e}")
                self.ner_analyzer = None
            # Keyword extraction with KeyBERT backed by Indonesian BERT
            try:
                # Use Indonesian BERT for KeyBERT
                self.keyword_extractor = KeyBERT(model='cahya/bert-base-indonesian-1.5G')
            except Exception as e:
                print(f"KeyBERT with IndoBERT failed: {e}, using default")
                try:
                    self.keyword_extractor = KeyBERT()
                except Exception:
                    self.keyword_extractor = None

            self.models_loaded = True
            print("All models loaded successfully!")
        except Exception as e:
            print(f"Error loading models: {e}")
            self.models_loaded = False

    def clean_text(self, text):
        """Basic text cleaning."""
        if pd.isna(text):
            return ""
        text = str(text)
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove mentions and hashtags
        text = re.sub(r'@\w+|#\w+', '', text)
        # Remove special characters
        text = re.sub(r'[^\w\s.,!?]', ' ', text)
        # Remove digits (before collapsing whitespace, so no stray gaps remain)
        text = re.sub(r'\d+', '', text)
        # Collapse extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def preprocess_text(self, texts):
        """Batch text preprocessing with the Indonesian BERT tokenizer."""
        cleaned_texts = [self.clean_text(text) for text in texts]

        # Tokenization with the BERT tokenizer
        if self.models_loaded and self.tokenizer is not None:
            try:
                tokenized_texts = []
                for text in cleaned_texts:
                    if text.strip():
                        tokens = self.tokenizer.tokenize(text)
                        tokenized_texts.append(" ".join(tokens))
                    else:
                        tokenized_texts.append("")
                return tokenized_texts
            except Exception as e:
                print(f"Tokenization error: {e}")
                return cleaned_texts
        else:
            return cleaned_texts

    def create_embeddings(self, texts):
        """Create embeddings with Indonesian BERT via PyTorch."""
        if not self.models_loaded or self.bert_model is None:
            return None
        try:
            # Method 1: use SentenceTransformer when available
            if self.embedding_model is not None:
                embeddings = self.embedding_model.encode(texts)
                return embeddings

            # Method 2: use Indonesian BERT directly with PyTorch
            embeddings = []
            with torch.no_grad():  # Disable gradient computation for inference
                for text in texts:
                    if text.strip():
                        # Tokenize and encode the text
                        encoded_input = self.tokenizer(
                            text,
                            return_tensors='pt',
                            max_length=512,
                            truncation=True,
                            padding=True
                        )
                        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
                        # Run BERT to get the hidden states
                        outputs = self.bert_model(**encoded_input)
                        # Use the [CLS] token embedding as the document representation
                        cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                        embeddings.append(cls_embedding[0])
                    else:
                        embeddings.append(np.zeros(768))  # Default hidden size for BERT base
            return np.array(embeddings)
        except Exception as e:
            print(f"Error creating embeddings: {e}")
            return None

    def analyze_emotion(self, texts):
        """Emotion analysis over a collection of texts using the Indonesian model."""
        if not self.models_loaded or self.emotion_model is None:
            # Fall back to the keyword-based emotion analysis
            return self._fallback_emotion_analysis(texts)
        try:
            emotions = []
            for text in texts:
                if len(text.strip()) > 10:  # Minimum text length
                    try:
                        # Tokenize the text
                        inputs = self.emotion_tokenizer(
                            text[:512],  # Truncate for the model
                            return_tensors='pt',
                            max_length=512,
                            truncation=True,
                            padding=True
                        )
                        inputs = {k: v.to(self.device) for k, v in inputs.items()}
                        # Predict the emotion
                        with torch.no_grad():
                            outputs = self.emotion_model(**inputs)
                            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
                        # Get the predicted class and its confidence
                        confidence, predicted_class = torch.max(predictions, dim=1)
                        emotion_idx = predicted_class.cpu().item()
                        confidence_score = confidence.cpu().item()
                        # Map to an emotion label
                        if emotion_idx < len(self.emotion_labels):
                            emotion_label = self.emotion_labels[emotion_idx]
                        else:
                            emotion_label = 'netral'
                        emotions.append({
                            'emotion': emotion_label,
                            'confidence': round(confidence_score, 3),
                            'all_scores': predictions.cpu().numpy()[0]
                        })
                    except Exception as e:
                        print(f"Error in emotion prediction for text: {e}")
                        emotions.append({
                            'emotion': 'netral',
                            'confidence': 0.0,
                            'all_scores': None
                        })
                else:
                    emotions.append({
                        'emotion': 'netral',
                        'confidence': 0.0,
                        'all_scores': None
                    })
            return emotions
        except Exception as e:
            print(f"Error in emotion analysis: {e}")
            return self._fallback_emotion_analysis(texts)

    def _fallback_emotion_analysis(self, texts):
        """Fallback emotion analysis using keyword matching."""
        emotion_keywords = {
            'senang': ['senang', 'bahagia', 'gembira', 'suka', 'bangga', 'puas', 'lega', 'sukacita'],
            'sedih': ['sedih', 'duka', 'pilu', 'kecewa', 'menyesal', 'haru', 'murung', 'nestapa'],
            'marah': ['marah', 'jengkel', 'kesal', 'geram', 'benci', 'dendam', 'gemas', 'berang', 'naik darah'],
            'takut': ['takut', 'khawatir', 'cemas', 'waswas', 'ngeri', 'gentar', 'panik', 'was-was'],
            'jijik': ['jijik', 'muak', 'mual', 'enggan', 'menjijikkan', 'kotor', 'kumuh'],
            'terkejut': ['terkejut', 'kaget', 'heran', 'takjub', 'terperanjat', 'mengagetkan'],
            'netral': ['netral', 'biasa', 'wajar', 'normal', 'lazim', 'umum']
        }
        emotions = []
        for text in texts:
            text_lower = text.lower()
            emotion_scores = {emotion: 0 for emotion in emotion_keywords.keys()}
            for emotion, keywords in emotion_keywords.items():
                for keyword in keywords:
                    if keyword in text_lower:
                        emotion_scores[emotion] += 1
            # Pick the emotion with the highest score
            max_score = max(emotion_scores.values())
            if max_score > 0:
                dominant_emotion = max(emotion_scores.items(), key=lambda x: x[1])[0]
                confidence = emotion_scores[dominant_emotion] / sum(emotion_scores.values())
            else:
                dominant_emotion = 'netral'
                confidence = 0.0
            emotions.append({
                'emotion': dominant_emotion,
                'confidence': round(confidence, 3),
                'all_scores': None
            })
        return emotions

    def extract_keywords(self, texts, top_n=5):
        """Keyword extraction with KeyBERT backed by Indonesian BERT."""
        if not self.models_loaded or self.keyword_extractor is None:
            return self._fallback_keyword_extraction(texts, top_n)
        try:
            all_keywords = []
            for text in texts:
                if len(text.strip()) > 20:
                    keywords = self.keyword_extractor.extract_keywords(
                        text,
                        keyphrase_ngram_range=(1, 2),
                        stop_words=None,
                        top_n=top_n,
                        diversity=0.5
                    )
                    all_keywords.append([kw[0] for kw in keywords])
                else:
                    all_keywords.append([])
            return all_keywords
        except Exception as e:
            print(f"Error in keyword extraction: {e}")
            return self._fallback_keyword_extraction(texts, top_n)

    def _fallback_keyword_extraction(self, texts, top_n=5):
        """Fallback keyword extraction using simple term frequency."""
        from collections import Counter

        # Indonesian stopwords
        stopwords = {
            'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'dengan', 'ini', 'itu',
            'ada', 'tidak', 'dalam', 'akan', 'atau', 'juga', 'saya', 'kamu', 'kami',
            'kita', 'mereka', 'adalah', 'sudah', 'belum', 'telah', 'dapat', 'bisa',
            'boleh', 'harus', 'perlu', 'lagi', 'saja', 'hanya', 'sangat', 'sekali',
            'lebih', 'paling', 'sementara'
        }
        all_keywords = []
        for text in texts:
            if len(text.strip()) > 10:
                # Simple tokenization
                words = re.findall(r'\b\w+\b', text.lower())
                # Drop stopwords and very short tokens
                words = [word for word in words if word not in stopwords and len(word) > 2]
                # Keep the most frequent words
                word_counts = Counter(words)
                keywords = [word for word, count in word_counts.most_common(top_n)]
                all_keywords.append(keywords)
            else:
                all_keywords.append([])
        return all_keywords

    def summarize_text(self, texts, max_length=100, min_length=30):
        """Text summarization with the Indonesian T5 model."""
        if not self.models_loaded or self.summarization_model is None:
            return self._fallback_summarization(texts, max_length)
        try:
            summaries = []
            for text in texts:
                if len(text.strip()) > 100:
                    try:
                        # Preprocess the text for summarization
                        cleaned_text = self.clean_text(text)
                        # Encode the text
                        input_ids = self.summarization_tokenizer.encode(
                            cleaned_text,
                            return_tensors='pt',
                            max_length=512,
                            truncation=True
                        ).to(self.device)
                        # Generate the summary with tuned decoding parameters
                        summary_ids = self.summarization_model.generate(
                            input_ids,
                            min_length=min_length,
                            max_length=max_length,
                            num_beams=8,
                            repetition_penalty=2.5,
                            length_penalty=1.0,
                            early_stopping=True,
                            no_repeat_ngram_size=2,
                            use_cache=True,
                            do_sample=True,
                            temperature=0.8,
                            top_k=50,
                            top_p=0.95
                        )
                        # Decode the summary
                        summary_text = self.summarization_tokenizer.decode(
                            summary_ids[0],
                            skip_special_tokens=True
                        )
                        # Post-process the summary
                        if summary_text.strip():
                            # Strip surrounding quotes if present
                            summary_text = re.sub(r'^["\']|["\']$', '', summary_text)
                            summaries.append(summary_text.strip())
                        else:
                            summaries.append(self._fallback_summarization([text], max_length)[0])
                    except Exception as e:
                        print(f"Error in T5 summarization: {e}")
                        summaries.append(self._fallback_summarization([text], max_length)[0])
                else:
                    # Keep the original text for short inputs
                    summaries.append(text[:max_length])
            return summaries
        except Exception as e:
            print(f"Error in summarization: {e}")
            return self._fallback_summarization(texts, max_length)

    def _fallback_summarization(self, texts, max_length=100):
        """Fallback summarization by extracting the leading sentences."""
        summaries = []
        for text in texts:
            if len(text.strip()) > 100:
                # Split into sentences
                sentences = re.split(r'[.!?]+', text)
                sentences = [s.strip() for s in sentences if s.strip()]
                # Use the first 2-3 sentences as a simple summary
                if len(sentences) > 3:
                    summary = ' '.join(sentences[:3])
                else:
                    summary = ' '.join(sentences)
                summaries.append(summary[:max_length])
            else:
                summaries.append(text[:max_length])
        return summaries

    def perform_ner(self, texts):
        """Named entity recognition."""
        if not self.models_loaded or self.ner_analyzer is None:
            return self._fallback_ner(texts)
        try:
            all_entities = []
            for text in texts:
                if len(text.strip()) > 10:
                    entities = self.ner_analyzer(text[:512])
                    # Group the entities by type
                    entity_dict = {}
                    for entity in entities:
                        entity_type = entity['entity_group']
                        if entity_type not in entity_dict:
                            entity_dict[entity_type] = []
                        entity_dict[entity_type].append(entity['word'])
                    all_entities.append(entity_dict)
                else:
                    all_entities.append({})
            return all_entities
        except Exception as e:
            print(f"Error in NER: {e}")
            return self._fallback_ner(texts)

    def _fallback_ner(self, texts):
        """Fallback NER using simple pattern matching."""
        all_entities = []
        for text in texts:
            entities = {}
            # Simple pattern matching for common entity types
            # People (two consecutive capitalized words)
            people = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', text)
            if people:
                entities['PER'] = list(set(people))
            # Organizations (contain certain marker words)
            org_keywords = ['PT', 'CV', 'Inc', 'Corp', 'Company', 'Perusahaan', 'Universitas', 'Institut']
            organizations = []
            for word in org_keywords:
                orgs = re.findall(rf'\b{word}\.?\s+[A-Za-z]+\b', text)
                organizations.extend(orgs)
            if organizations:
                entities['ORG'] = organizations
            # Locations (capitalized words that may be place names)
            locations = re.findall(r'\b[A-Z][a-z]+(?: [A-Z][a-z]+)*\b', text)
            # Filter out multi-word matches that are likely person names (first + last name)
            common_locations = ['Indonesia', 'Jakarta', 'Surabaya', 'Bandung', 'Medan', 'Semarang']
            locations = [loc for loc in locations if len(loc.split()) == 1 or loc in common_locations]
            if locations:
                entities['LOC'] = locations
            all_entities.append(entities)
        return all_entities

    def analyze_media(self, df, content_column='content'):
        """Full media analysis pipeline."""
        print("Starting media analysis pipeline...")

        # Load the models if they are not loaded yet
        if not self.models_loaded:
            self.load_models()
        if not self.models_loaded:
            print("Models failed to load, using basic analysis")
            return self._basic_analysis(df, content_column)

        # Keep only rows with usable content
        valid_content = df[content_column].notna() & (df[content_column].str.len() > 10)
        analysis_df = df[valid_content].copy()
        if len(analysis_df) == 0:
            print("No valid content to analyze")
            return df, pd.DataFrame()

        print(f"Analyzing {len(analysis_df)} articles...")

        # Analysis pipeline
        texts = analysis_df[content_column].tolist()

        # 1. Cleaning and preprocessing
        print("Step 1: Cleaning and preprocessing text...")
        cleaned_texts = self.preprocess_text(texts)

        # 2. Embeddings (sampled for efficiency on large datasets).
        # Note: the embeddings are computed here but not yet attached to the output.
        print("Step 2: Creating embeddings...")
        if len(cleaned_texts) > 100:
            sample_texts = cleaned_texts[:100]
            embeddings = self.create_embeddings(sample_texts)
        else:
            embeddings = self.create_embeddings(cleaned_texts)

        # 3. Emotion analysis with the Indonesian model
        print("Step 3: Analyzing emotions with Indonesian model...")
        emotions = self.analyze_emotion(cleaned_texts)

        # 4. Keyword extraction
        print("Step 4: Extracting keywords...")
        keywords = self.extract_keywords(cleaned_texts)

        # 5. Summarization with the Indonesian T5 model
        print("Step 5: Generating summaries with T5 model...")
        long_text_indices = [i for i, text in enumerate(cleaned_texts) if len(text) > 100]
        summaries = [""] * len(cleaned_texts)
        if long_text_indices and self.summarization_model:
            print(f"Generating summaries for {len(long_text_indices)} long texts...")
            long_texts = [cleaned_texts[i] for i in long_text_indices]
            try:
                long_summaries = self.summarize_text(long_texts, max_length=80, min_length=20)
                for i, summary in zip(long_text_indices, long_summaries):
                    summaries[i] = summary
                print("✅ T5 summarization completed successfully")
            except Exception as e:
                print(f"❌ T5 summarization failed: {e}")
                # Fallback for the texts that failed
                fallback_summaries = self._fallback_summarization(long_texts, 80)
                for i, summary in zip(long_text_indices, fallback_summaries):
                    summaries[i] = summary
        else:
            print("No long texts found for summarization")
        # 6. NER (sampled for efficiency on large datasets)
        print("Step 6: Performing NER...")
        if len(cleaned_texts) > 50:
            ner_texts = cleaned_texts[:50]
            ner_results = self.perform_ner(ner_texts)
            # Pad with empty results so every text has an entry
            ner_results.extend([{}] * (len(cleaned_texts) - len(ner_results)))
        else:
            ner_results = self.perform_ner(cleaned_texts)

        # Attach the analysis results to the DataFrame
        analysis_df = analysis_df.reset_index(drop=True)
        analysis_df['cleaned_content'] = cleaned_texts
        analysis_df['emotion'] = [e['emotion'] for e in emotions]
        analysis_df['emotion_confidence'] = [e['confidence'] for e in emotions]
        analysis_df['keywords'] = keywords
        analysis_df['summary'] = summaries
        analysis_df['entities'] = ner_results

        # Compute additional statistics
        analysis_df['content_length'] = analysis_df['cleaned_content'].str.len()
        analysis_df['word_count'] = analysis_df['cleaned_content'].str.split().str.len()
        analysis_df['has_summary'] = analysis_df['summary'].str.len() > 0

        # Build a separate DataFrame for the statistics
        emotion_distribution = pd.Series([e['emotion'] for e in emotions]).value_counts().to_dict()
        stats_data = {
            'total_articles': len(analysis_df),
            'articles_analyzed': len(analysis_df),
            'articles_with_emotion': len([e for e in emotions if e['emotion'] != 'netral' and e['confidence'] > 0.3]),
            'articles_with_summary': analysis_df['has_summary'].sum(),
            'avg_emotion_confidence': np.mean([e['confidence'] for e in emotions]),
            'total_keywords': sum(len(kw) for kw in keywords),
            'avg_content_length': np.mean(analysis_df['content_length']),
            'avg_word_count': np.mean(analysis_df['word_count']),
            'emotion_distribution': emotion_distribution,
            'total_entities': sum(len(entities) for entities in ner_results),
            'summarization_model': 'T5-Indonesian' if self.summarization_model else 'Fallback',
            'emotion_model': 'IndoBERT-Emotion' if self.emotion_model else 'Fallback'
        }
        stats_df = pd.DataFrame([stats_data])

        print("Media analysis completed!")
        print(f"Emotion distribution: {emotion_distribution}")
        return analysis_df, stats_df

    def _basic_analysis(self, df, content_column='content'):
        """Basic analysis used when the NLP models fail to load."""
        print("Performing basic analysis...")
        valid_content = df[content_column].notna() & (df[content_column].str.len() > 10)
        analysis_df = df[valid_content].copy()
        if len(analysis_df) == 0:
            return df, pd.DataFrame()

        # Basic cleaning
        analysis_df['cleaned_content'] = analysis_df[content_column].apply(self.clean_text)
        analysis_df['content_length'] = analysis_df['cleaned_content'].str.len()
        analysis_df['word_count'] = analysis_df['cleaned_content'].str.split().str.len()

        # Keyword-based emotion analysis fallback
        emotions = self._fallback_emotion_analysis(analysis_df['cleaned_content'].tolist())
        analysis_df['emotion'] = [e['emotion'] for e in emotions]
        analysis_df['emotion_confidence'] = [e['confidence'] for e in emotions]

        # Frequency-based keyword extraction
        keywords = self._fallback_keyword_extraction(analysis_df['cleaned_content'].tolist())
        analysis_df['keywords'] = keywords

        # Sentence-extraction summarization fallback
        summaries = self._fallback_summarization(analysis_df['cleaned_content'].tolist())
        analysis_df['summary'] = summaries
        analysis_df['has_summary'] = analysis_df['summary'].str.len() > 0

        # Basic stats; guard against a missing 'source' column
        stats_data = {
            'total_articles': len(analysis_df),
            'articles_analyzed': len(analysis_df),
            'avg_content_length': analysis_df['content_length'].mean(),
            'avg_word_count': analysis_df['word_count'].mean(),
            'total_sources': analysis_df['source'].nunique() if 'source' in analysis_df.columns else 0,
            'emotion_distribution': analysis_df['emotion'].value_counts().to_dict(),
            'articles_with_summary': analysis_df['has_summary'].sum(),
            'summarization_model': 'Fallback',
            'emotion_model': 'Fallback'
        }
        stats_df = pd.DataFrame([stats_data])
        return analysis_df, stats_df


# Singleton instance
media_analyzer = MediaAnalyzer()
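

# Minimal usage sketch (added for illustration, not part of the pipeline itself):
# it assumes a DataFrame with a 'content' column and, for the basic-analysis
# path, an optional 'source' column; the sample rows below are hypothetical.
if __name__ == "__main__":
    sample_df = pd.DataFrame({
        'source': ['contoh-media-a', 'contoh-media-b'],
        'content': [
            "Pemerintah mengumumkan kebijakan baru yang disambut gembira oleh warga "
            "karena dianggap meringankan beban ekonomi sehari-hari.",
            "Banjir melanda beberapa wilayah dan membuat warga khawatir akan "
            "kerusakan rumah serta terganggunya akses jalan utama.",
        ]
    })
    # analyze_media returns (per-article results, one-row statistics DataFrame)
    results_df, stats_df = media_analyzer.analyze_media(sample_df, content_column='content')
    print(results_df[['emotion', 'emotion_confidence', 'keywords', 'summary']])
    print(stats_df.T)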