# llm-safety-assessment / src/eval_provider.py
# Gradio interface for LLM safety evaluation: fetch Hugging Face inference
# provider costs and pick the best model for a prompt.
# (origin: commit cf2b921, author jnm38)
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Public Hugging Face page listing inference providers with per-model pricing
# and capability columns; scraped by fetch_inference_page/parse_provider_costs.
HF_INFERENCE_PAGE = "https://huggingface.co/inference/models"
def fetch_inference_page():
    """Download the raw HTML of the HF inference-providers page.

    Returns the page body as text; raises ``requests.HTTPError`` on a
    non-2xx status (60 s timeout guards against a hung connection).
    """
    response = requests.get(HF_INFERENCE_PAGE, timeout=60)
    response.raise_for_status()
    return response.text
def _parse_float(text):
    """Parse a cost/number cell like "$0.30" or "12.5"; return None for "-", empty, or junk."""
    if not text or text == '-':
        return None
    try:
        return float(text.replace('$', '').replace(',', '').strip())
    except ValueError:
        return None


def _parse_int(text):
    """Parse an integer cell like "131,072"; return None for "-", empty, or junk."""
    if not text or text == '-':
        return None
    try:
        return int(text.replace(',', '').strip())
    except ValueError:
        return None


def _parse_yes_no(text):
    """Normalize a capability cell: keep literal "Yes"/"No", map anything else to None."""
    return text if text in ('Yes', 'No') else None


def parse_provider_costs(html_content):
    """
    Parse provider cost information from the HTML table.

    For each table row extracts: model, provider, input_cost, output_cost,
    context_length, latency, speed, tool_calling, structured_output.

    Parameters
    ----------
    html_content : str
        Raw HTML of the HF inference-providers page.

    Returns
    -------
    list[dict]
        One entry per (provider, model) row with keys: provider, model,
        input_cost_per_1M, output_cost_per_1M, context_length, latency_ttft,
        speed_tokens_per_sec, tool_calling, structured_output. Missing or
        unparsable values are None.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    provider_model_data = []
    rows = soup.find_all('tr')
    print(f"Found {len(rows)} table rows")
    for idx, row in enumerate(rows):
        cells = row.find_all('td')
        if len(cells) < 8:  # need at least 8 columns for all data
            continue
        # Debug: dump the first few rows so table-layout changes are easy to spot
        if idx < 3:
            print(f"Row {idx}: {len(cells)} cells")
            for i, cell in enumerate(cells[:9]):
                print(f" Cell {i}: {cell.get_text(strip=True)[:50]}")
        try:
            texts = [cell.get_text(strip=True) for cell in cells]
            # Model cell format: "org/model-name Copy to clipboard"
            model = texts[0].split("Copy to clipboard")[0].strip() if texts[0] else None
            # Provider cell may carry "cheapest"/"fastest" tags; keep the first token
            provider = texts[1].split()[0] if texts[1] else None
            if not provider or provider == '-' or not model:
                continue
            # Column 8 (structured output) is optional: guard explicitly
            structured_cell = texts[8] if len(texts) > 8 else "-"
            provider_model_data.append({
                'provider': provider,
                'model': model,
                'input_cost_per_1M': _parse_float(texts[2]),   # $/1M input tokens
                'output_cost_per_1M': _parse_float(texts[3]),  # $/1M output tokens
                'context_length': _parse_int(texts[4]),
                'latency_ttft': _parse_float(texts[5]),        # seconds
                'speed_tokens_per_sec': _parse_float(texts[6]),
                'tool_calling': _parse_yes_no(texts[7]),
                'structured_output': _parse_yes_no(structured_cell),
            })
        except (ValueError, IndexError) as e:
            # Keep going on malformed rows; only log the first few for debugging
            if idx < 3:
                print(f" Error parsing row {idx}: {e}")
            continue
    print(f"Extracted {len(provider_model_data)} provider-model entries")
    return provider_model_data
def load_provider_costs(csv_path="hf_provider_model_costs.csv"):
    """Read the provider-model cost table from *csv_path* into a DataFrame."""
    return pd.read_csv(csv_path)
def find_best_model(prompt,
                    csv_path="hf_provider_model_costs.csv",
                    strategy="cheapest_total",
                    require_both_costs=True,
                    input_output_ratio=1.0,
                    min_context_length=None,
                    max_latency=None,
                    min_speed=None,
                    require_tool_calling=False,
                    require_structured_output=False):
    """
    Find the most appropriate model for a given prompt based on cost strategy.

    Parameters:
    -----------
    prompt : str
        The user prompt (currently used for length estimation)
    csv_path : str
        Path to the CSV file with provider-model costs
    strategy : str
        Selection strategy:
        - "cheapest_total": Cheapest combined input+output cost
        - "cheapest_input": Cheapest input cost only
        - "cheapest_output": Cheapest output cost only
        - "best_value": Best value considering typical usage (uses input_output_ratio)
    require_both_costs : bool
        If True, only consider models with both input and output costs available
    input_output_ratio : float
        Expected input/output token ratio for "best_value" strategy (default 1.0)
    min_context_length : int, optional
        Minimum required context length
    max_latency : float, optional
        Maximum acceptable latency (TTFT in seconds)
    min_speed : float, optional
        Minimum required speed (tokens/second)
    require_tool_calling : bool
        If True, only return models that support tool calling
    require_structured_output : bool
        If True, only return models that support structured output

    Returns:
    --------
    dict or None
        dict with keys: provider, model, input_cost_per_1M, output_cost_per_1M,
        estimated_cost, strategy, context_length, latency_ttft,
        speed_tokens_per_sec, tool_calling, structured_output.
        None if no model satisfies the filters.
    """
    df = load_provider_costs(csv_path)
    # Filter out entries based on cost availability
    if require_both_costs:
        df_filtered = df.dropna(subset=['input_cost_per_1M', 'output_cost_per_1M']).copy()
    else:
        df_filtered = df[df['input_cost_per_1M'].notna() | df['output_cost_per_1M'].notna()].copy()
    # Apply capability filters
    if min_context_length is not None:
        df_filtered = df_filtered[df_filtered['context_length'] >= min_context_length]
    if max_latency is not None:
        df_filtered = df_filtered[(df_filtered['latency_ttft'].notna()) & (df_filtered['latency_ttft'] <= max_latency)]
    if min_speed is not None:
        df_filtered = df_filtered[(df_filtered['speed_tokens_per_sec'].notna()) & (df_filtered['speed_tokens_per_sec'] >= min_speed)]
    if require_tool_calling:
        df_filtered = df_filtered[df_filtered['tool_calling'] == 'Yes']
    if require_structured_output:
        df_filtered = df_filtered[df_filtered['structured_output'] == 'Yes']
    if df_filtered.empty:
        return None
    # Fill NaN costs with 0 for calculation purposes (only reachable when
    # require_both_costs=False and one side of the cost is missing)
    df_filtered.loc[:, 'input_cost_per_1M'] = df_filtered['input_cost_per_1M'].fillna(0)
    df_filtered.loc[:, 'output_cost_per_1M'] = df_filtered['output_cost_per_1M'].fillna(0)
    # Estimate prompt length (rough approximation: 1 word ≈ 1.3 tokens)
    prompt_tokens = len(prompt.split()) * 1.3 / 1_000_000  # Convert to millions
    # Apply selection strategy; rows with zero cost are excluded so a missing
    # price (filled to 0 above) can never win a "cheapest" ranking
    if strategy == "cheapest_input":
        df_filtered = df_filtered[df_filtered['input_cost_per_1M'] > 0]
        if df_filtered.empty:
            return None
        best = df_filtered.nsmallest(1, 'input_cost_per_1M').iloc[0]
        estimated_cost = best['input_cost_per_1M'] * prompt_tokens
    elif strategy == "cheapest_output":
        df_filtered = df_filtered[df_filtered['output_cost_per_1M'] > 0]
        if df_filtered.empty:
            return None
        best = df_filtered.nsmallest(1, 'output_cost_per_1M').iloc[0]
        # Assume output is similar length to input
        estimated_cost = best['output_cost_per_1M'] * prompt_tokens
    elif strategy == "best_value":
        # Calculate weighted cost based on input/output ratio
        df_filtered.loc[:, 'weighted_cost'] = (
            df_filtered['input_cost_per_1M'] * input_output_ratio +
            df_filtered['output_cost_per_1M']
        ) / (input_output_ratio + 1)
        df_filtered = df_filtered[df_filtered['weighted_cost'] > 0]
        if df_filtered.empty:
            return None
        best = df_filtered.nsmallest(1, 'weighted_cost').iloc[0]
        estimated_cost = best['weighted_cost'] * prompt_tokens * 2  # Input + output
    else:  # "cheapest_total" (default)
        df_filtered.loc[:, 'total_cost'] = df_filtered['input_cost_per_1M'] + df_filtered['output_cost_per_1M']
        df_filtered = df_filtered[df_filtered['total_cost'] > 0]
        if df_filtered.empty:
            return None
        best = df_filtered.nsmallest(1, 'total_cost').iloc[0]
        estimated_cost = best['total_cost'] * prompt_tokens
    return {
        'provider': best['provider'],
        'model': best['model'],
        'input_cost_per_1M': best['input_cost_per_1M'],
        'output_cost_per_1M': best['output_cost_per_1M'],
        'estimated_cost': estimated_cost,
        'strategy': strategy,
        'context_length': best.get('context_length'),
        'latency_ttft': best.get('latency_ttft'),
        'speed_tokens_per_sec': best.get('speed_tokens_per_sec'),
        'tool_calling': best.get('tool_calling'),
        'structured_output': best.get('structured_output')
    }
def test_find_best_model():
    """Demonstrate find_best_model over sample prompts, strategies, and filters."""
    import os
    csv_path = "hf_provider_model_costs.csv"
    # The CSV is produced by a prior fetch run; bail out with guidance if absent.
    if not os.path.exists(csv_path):
        print(f"\nERROR: {csv_path} not found!")
        print("Please run the script without --test flag first to fetch and generate the CSV file.")
        print("Example: python eval_provider.py")
        return
    banner = "=" * 70
    print("\n" + banner)
    print("TESTING: find_best_model function")
    print(banner)
    test_prompts = [
        "What is the capital of France?",
        "Write a detailed essay about the impact of artificial intelligence on modern society, covering economic, social, and ethical implications.",
        "Translate this text to Spanish: Hello, how are you today?"
    ]
    all_strategies = ["cheapest_total", "cheapest_input", "cheapest_output", "best_value"]
    for num, text in enumerate(test_prompts, 1):
        print("\n" + banner)
        print(f"Test {num}: Prompt length = {len(text)} chars, ~{len(text.split())} words")
        print(f"Prompt: {text[:60]}...")
        print(banner)
        for strat in all_strategies:
            pick = find_best_model(text, strategy=strat, require_both_costs=True)
            print(f"\n Strategy: {strat.upper()}")
            if not pick:
                print(f" No suitable model found")
                continue
            print(f" Provider: {pick['provider']}")
            print(f" Model: {pick['model']}")
            print(f" Input cost: ${pick['input_cost_per_1M']:.4f}/1M tokens")
            print(f" Output cost: ${pick['output_cost_per_1M']:.4f}/1M tokens")
            print(f" Context length: {pick['context_length'] if pick['context_length'] else 'N/A'}")
            print(f" Latency (TTFT): {pick['latency_ttft']:.2f}s" if pick['latency_ttft'] else " Latency: N/A")
            print(f" Speed: {pick['speed_tokens_per_sec']:.0f} tok/s" if pick['speed_tokens_per_sec'] else " Speed: N/A")
            print(f" Tool calling: {pick['tool_calling'] or 'N/A'}")
            print(f" Structured output: {pick['structured_output'] or 'N/A'}")
            print(f" Estimated cost: ${pick['estimated_cost']:.8f}")
    # Relax the cost requirement: models with only one known price may win.
    print("\n" + banner)
    print("Test: Allow models with partial cost information")
    print(banner)
    pick = find_best_model(test_prompts[0], strategy="cheapest_total", require_both_costs=False)
    if pick:
        print(f" Provider: {pick['provider']}")
        print(f" Model: {pick['model']}")
        print(f" Input cost: ${pick['input_cost_per_1M']:.4f}/1M tokens")
        print(f" Output cost: ${pick['output_cost_per_1M']:.4f}/1M tokens")
    # Capability filters: tool calling + structured output + long context.
    print("\n" + banner)
    print("Test: Filter by capabilities (tool calling + structured output)")
    print(banner)
    pick = find_best_model(
        test_prompts[0],
        strategy="cheapest_total",
        require_tool_calling=True,
        require_structured_output=True,
        min_context_length=100000
    )
    if not pick:
        print(" No models found matching criteria")
    else:
        print(f" Provider: {pick['provider']}")
        print(f" Model: {pick['model']}")
        print(f" Context length: {pick['context_length']}")
        print(f" Tool calling: {pick['tool_calling']}")
        print(f" Structured output: {pick['structured_output']}")
        print(f" Input cost: ${pick['input_cost_per_1M']:.4f}/1M tokens")
        print(f" Output cost: ${pick['output_cost_per_1M']:.4f}/1M tokens")
def main():
    """Fetch the providers page, persist the cost table to CSV, then run the demo tests."""
    import sys
    # "--test" skips the network fetch and exercises find_best_model on an existing CSV.
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        test_find_best_model()
        return
    print("Fetching inference providers page...")
    page_html = fetch_inference_page()
    print("Parsing provider costs...")
    entries = parse_provider_costs(page_html)
    if not entries:
        print("No provider cost data found.")
        return
    df = pd.DataFrame(entries).sort_values(["provider", "model"])
    df.to_csv("hf_provider_model_costs.csv", index=False)
    print(f"\nFound {len(df)} provider-model combinations")
    print(f"Unique providers: {df['provider'].nunique()}")
    print(f"Unique models: {df['model'].nunique()}")
    print("\nProvider-Model Cost Summary (first 20 rows):")
    print(df.head(20).to_string(index=False))
    # Run comprehensive tests
    test_find_best_model()


if __name__ == "__main__":
    main()