# iris/app.py
"""
Llama 3.2 3B Fine-tuned Chatbot
Fine-tuned conversational model based on FineTome-100k
Deployed on HuggingFace Spaces
"""
import os

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
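# Assumed runtime dependencies for this Space (a requirements.txt sketch based
# only on the imports above; exact package versions are not specified here):
#   gradio
#   llama-cpp-python
#   huggingface_hub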
print("πŸš€ Starting Llama 3.2 3B Chatbot...")
print(f"πŸ“‚ Working directory: {os.getcwd()}")
# Download GGUF model (using absolute path)
print("πŸ“₯ Downloading model...")
try:
    # Use an absolute path on HuggingFace Spaces
    model_dir = "/app/models"
    os.makedirs(model_dir, exist_ok=True)
    print(f"πŸ“‚ Model directory: {model_dir}")

    model_path = hf_hub_download(
        repo_id="handsomeLiu/ID2223-llama-3.2-3b-finetune-lora_model_new",
        filename="llama-3.2-3b-finetuned-Q4_K_M.gguf",
        local_dir=model_dir,  # Absolute path
    )
    print(f"βœ… Model downloaded: {model_path}")

    # Verify the downloaded file
    if os.path.exists(model_path):
        file_size = os.path.getsize(model_path) / (1024**3)
        print(f"πŸ“Š File size: {file_size:.2f} GB")

        # Check the GGUF magic bytes
        with open(model_path, 'rb') as f:
            header = f.read(4)
            print(f"πŸ” File header: {header}")

        if header == b'GGUF':
            print("βœ… Valid GGUF header detected")
        else:
            print(f"❌ INVALID GGUF! Expected b'GGUF', got {header}")
            print("πŸ’‘ This file is NOT in valid GGUF format")
    else:
        raise FileNotFoundError(f"File not found: {model_path}")
except Exception as e:
    print(f"❌ Error: {e}")
    raise
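# Optional deeper header check (an added sketch, not part of the original app;
# assumes the GGUF v2+ header layout: 4-byte magic "GGUF", little-endian uint32
# version, then uint64 tensor count and uint64 metadata key/value count).
import struct
with open(model_path, "rb") as f:
    if f.read(4) == b"GGUF":
        gguf_version, = struct.unpack("<I", f.read(4))
        n_tensors, n_kv = struct.unpack("<QQ", f.read(16))
        print(f"πŸ”Ž GGUF v{gguf_version}: {n_tensors} tensors, {n_kv} metadata keys")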
# Load model into llama.cpp
print("\nπŸ”§ Loading model into llama.cpp...")
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,        # Context window (tokens)
        n_threads=4,       # CPU threads
        n_gpu_layers=0,    # CPU-only inference
        verbose=True       # Enable verbose logging for debugging
    )
    print("βœ… Model loaded successfully!")
except Exception as e:
    print(f"❌ Load failed: {e}")
    print(f"πŸ“‚ File: {model_path}")
    print(f"πŸ“Š Size: {os.path.getsize(model_path) / (1024**3):.2f} GB")
    print("\nπŸ’‘ File downloaded but cannot be loaded by llama.cpp")
    print("   β†’ The file is NOT in valid GGUF format")
    print("\n🎯 Solution: use unsloth in Colab to regenerate a correct GGUF file")
    raise
def chat(message, history, temperature, top_p, max_tokens):
    """
    Generate a response for the current message.

    Args:
        message: Current user input
        history: Conversation history [{'role': 'user', 'content': '...'}, ...]
        temperature: Sampling temperature (controls randomness)
        top_p: Nucleus sampling parameter
        max_tokens: Maximum number of tokens to generate
    """
    # Build the prompt in the Llama 3 chat template format
    prompt = "<|begin_of_text|>"

    # Add conversation history
    for msg in history:
        role = msg['role']
        content = msg['content']
        prompt += f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"

    # Add the current user message and open the assistant turn
    prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>"
    prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"

    # Generate the response
    try:
        output = llm(
            prompt,
            max_tokens=int(max_tokens),
            temperature=temperature,
            top_p=top_p,
            echo=False,
            stop=["<|eot_id|>", "<|end_of_text|>"]  # Stop tokens
        )
        response = output['choices'][0]['text'].strip()
        return response
    except Exception as e:
        return f"❌ Generation error: {str(e)}"
def chat_wrapper(message, history, temperature, top_p, max_tokens):
    """
    Generator wrapper around chat() for the Gradio Blocks interaction.
    Yields (textbox value, updated history) pairs.
    """
    if not message:
        # This function is a generator, so yield (not return) the unchanged state
        yield "", history
        return

    # Add the user message to the history and display it immediately
    new_history = history + [{"role": "user", "content": message}]
    yield "", new_history

    # Generate the assistant response
    response = chat(message, history, temperature, top_p, max_tokens)

    # Add the assistant response to the history
    new_history.append({"role": "assistant", "content": response})
    yield "", new_history
# Create Gradio interface using Blocks for full control
with gr.Blocks(theme=gr.themes.Soft(), title="Llama 3.2 3B Chatbot") as demo:
    gr.Markdown("""
# πŸ¦™ Llama 3.2 3B Fine-tuned Chatbot
Llama 3.2 3B model fine-tuned on the **FineTome-100k** dataset

- πŸ’Ύ **Model**: Llama 3.2 3B Instruct + LoRA
- πŸ“Š **Data**: 100k high-quality conversations
- βš™οΈ **Quantization**: Q4_K_M (CPU optimized)
- πŸ”— **GitHub**: [ID2223_lab2](https://github.com/Jiananliu12138/ID2223_lab2)
""")

    chatbot = gr.Chatbot(
        height=500,
        show_label=False,
        avatar_images=(None, "πŸ¦™"),
        type="messages"
    )

    with gr.Row():
        msg = gr.Textbox(
            placeholder="Enter your question...",
            container=False,
            scale=7
        )
        submit_btn = gr.Button("πŸ“€ Send", scale=1)

    with gr.Accordion(label="βš™οΈ Advanced Settings", open=False):
        temperature = gr.Slider(
            minimum=0,
            maximum=2,
            value=0.7,
            step=0.1,
            label="🌑️ Temperature",
            info="Controls randomness: low = deterministic, high = creative"
        )
        top_p = gr.Slider(
            minimum=0,
            maximum=1,
            value=0.9,
            step=0.05,
            label="🎯 Top P",
            info="Nucleus sampling: only consider the top-P probability mass"
        )
        max_tokens = gr.Slider(
            minimum=64,
            maximum=512,
            value=256,
            step=64,
            label="πŸ“ Max Tokens",
            info="Maximum generation length"
        )

    # Bind the click event ONLY to the button (Enter-key submission is disabled)
    submit_btn.click(
        fn=chat_wrapper,
        inputs=[msg, chatbot, temperature, top_p, max_tokens],
        outputs=[msg, chatbot],
        concurrency_limit=1
    )

    gr.Markdown("""
---
### πŸ“Š Training Details
- **Base Model**: unsloth/Llama-3.2-3B-Instruct
- **Fine-tuning Method**: LoRA (r=16, alpha=16)
- **Training Data**: FineTome-100k
- **Training Steps**: 1 full epoch (~12,500 steps)
- **Checkpoints**: Saved every 500 steps

### πŸ’‘ Usage Tips
- **Temperature 0.3-0.7**: Best for factual Q&A
- **Temperature 0.7-1.2**: Best for creative writing
- **Max Tokens 128**: Best for short answers
- **Max Tokens 256-512**: Best for detailed explanations

### 🎯 Project Features
βœ… Complete checkpoint mechanism (supports resuming training)
βœ… Saved to Google Drive (persistent storage)
βœ… Converted to GGUF format (optimized for CPU inference)
βœ… Deployed on HuggingFace Spaces (free access)

**Made for the ID2223 Scalable Machine Learning course**
""")
# Launch the application
if __name__ == "__main__":
    demo.queue()  # Enable queue for multi-user support
    demo.launch(
        share=False,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860
    )