from dotenv import load_dotenv
import os
import argparse
import json
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch

# Load environment variables and authenticate with the Hugging Face Hub if a token is available
load_dotenv()
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    try:
        login(hf_token)
        print("Successfully logged in to Hugging Face Hub")
    except Exception as e:
        print(f"Warning: Failed to login to Hugging Face Hub: {e}")
else:
    print("Warning: HF_TOKEN not found in environment variables")


def parse_args():
    parser = argparse.ArgumentParser(description="Evaluate LLM safety")
    parser.add_argument("--model_name", default="HuggingFaceTB/SmolLM3-3B", help="Model name")
    parser.add_argument("--dataset_name", default="ai-safety-institute/AgentHarm", help="Dataset name")
    parser.add_argument("--config", default=None, help="Dataset config")
    parser.add_argument("--split", default=None, help="Dataset split; if None, use the first available")
    parser.add_argument("--num_samples", type=int, default=5, help="Number of samples")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature")
    parser.add_argument("--max_tokens", type=int, default=100, help="Max tokens to generate")
    parser.add_argument("--top_p", type=float, default=1.0, help="Top-p sampling")
    parser.add_argument("--top_k", type=int, default=-1, help="Top-k sampling (-1 for no limit)")
    parser.add_argument("--seed", type=int, default=None, help="Random seed")
    parser.add_argument("--repetition_penalty", type=float, default=1.0, help="Repetition penalty")
    parser.add_argument("--prompt_field", default=None, help="Field name for prompts; if None, auto-detect")
    parser.add_argument("--output_file", default=None, help="Output file to save results (JSON format)")
    return parser.parse_args()


def main():
    args = parse_args()

    # Load a small LLM for testing (swap in a larger model if needed)
    model_name = args.model_name
    print(f"Loading model: {model_name}")
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load tokenizer; fall back to the EOS token when no pad token is defined
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model in half precision on GPU, full precision on CPU
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto" if device == "cuda" else None,
    )
    if device == "cpu":
        model = model.to(device)

    # Load one of the safety datasets, e.g., AgentHarm harmless_benign
    dataset_name = args.dataset_name
    config = args.config
    print(f"Loading dataset: {dataset_name} with config {config}")
    dataset = load_dataset(dataset_name, config)

    # Use the requested split, or fall back to the first available one (often 'train')
    split = args.split if args.split else list(dataset.keys())[0]
    print(f"Using split: {split}")

    # Sample a few examples (e.g., 5) to test
    num_samples = args.num_samples
    samples = dataset[split].select(range(min(num_samples, len(dataset[split]))))

    # Prepare prompts: use the explicit field if given, otherwise auto-detect common layouts
    prompts = []
    for example in samples:
        if args.prompt_field and args.prompt_field in example:
            prompts.append(example[args.prompt_field])
        elif 'prompt' in example:
            prompts.append(example['prompt'])
        elif 'chosen' in example and isinstance(example['chosen'], str):
            # For hh-rlhf-style data, extract the first human turn from the conversation
            for line in example['chosen'].split('\n\n'):
                if line.startswith('Human:'):
                    prompts.append(line.replace('Human: ', '', 1))
                    break
            else:
                # No 'Human:' turn found; keep the raw text so prompts stay aligned with samples
                prompts.append(example['chosen'])
        else:
            prompts.append(str(example))  # Fallback

    # Generate responses
    print("Generating responses...")
    outputs = []
    for i, prompt in enumerate(prompts):
        print(f"Generating response {i+1}/{len(prompts)}...")
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=32000)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        gen_kwargs = {
            "max_new_tokens": args.max_tokens,
            "temperature": args.temperature,
            "top_p": args.top_p,
            "do_sample": args.temperature > 0,
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "repetition_penalty": args.repetition_penalty,
        }
        if args.top_k > 0:
            gen_kwargs["top_k"] = int(args.top_k)
        if args.seed is not None:
            torch.manual_seed(int(args.seed))
            if torch.cuda.is_available():
                torch.cuda.manual_seed(int(args.seed))
        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)
        # Decode only the generated part (excluding the input prompt)
        generated_text = tokenizer.decode(output_ids[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
        outputs.append({"text": generated_text})

    # Print results and collect them for optional JSON output
    results = []
    for i, output in enumerate(outputs):
        result = {
            "sample": i + 1,
            "prompt": prompts[i],
            "response": output["text"],
        }
        results.append(result)
        print(f"\n--- Sample {i+1} ---")
        print(f"Prompt: {prompts[i]}")
        print(f"Response: {output['text']}")

    if args.output_file:
        with open(args.output_file, 'w') as f:
            json.dump(results, f, indent=2)
        print(f"\nResults saved to {args.output_file}")


if __name__ == "__main__":
    main()
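
# Example invocation (sketch only; the script filename is hypothetical, and the AgentHarm
# dataset may require accepting its terms on the Hugging Face Hub before it can be loaded):
#   python evaluate_safety.py \
#       --model_name HuggingFaceTB/SmolLM3-3B \
#       --dataset_name ai-safety-institute/AgentHarm \
#       --config harmless_benign \
#       --num_samples 5 \
#       --output_file results.json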