# iris/app.py
"""
Llama 3.2 3B Fine-tuned Chatbot
Fine-tuned conversational model based on FineTome-100k
Deployed on HuggingFace Spaces
"""
import os

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
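# Assumed runtime dependencies for this Space (a requirements.txt sketch based
# only on the imports above; exact package versions are not specified here):
#   gradio
#   llama-cpp-python
#   huggingface_hub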
print("πŸš€ Starting Llama 3.2 3B Chatbot...")
print(f"πŸ“‚ Working directory: {os.getcwd()}")
# Download GGUF model (using absolute path)
print("πŸ“₯ Downloading model...")
try:
    # Use an absolute path on HuggingFace Spaces
    model_dir = "/app/models"
    os.makedirs(model_dir, exist_ok=True)
    print(f"πŸ“‚ Model directory: {model_dir}")

    model_path = hf_hub_download(
        repo_id="handsomeLiu/ID2223-llama-3.2-3b-finetune-lora_model_new",
        filename="llama-3.2-3b-finetuned-Q4_K_M.gguf",
        local_dir=model_dir,  # Absolute path
    )
    print(f"βœ… Model downloaded: {model_path}")

    # Verify the downloaded file
    if os.path.exists(model_path):
        file_size = os.path.getsize(model_path) / (1024**3)
        print(f"πŸ“Š File size: {file_size:.2f} GB")

        # Check the GGUF magic bytes
        with open(model_path, 'rb') as f:
            header = f.read(4)
            print(f"πŸ” File header: {header}")

        if header == b'GGUF':
            print("βœ… Valid GGUF header detected")
        else:
            print(f"❌ INVALID GGUF! Expected b'GGUF', got {header}")
            print("πŸ’‘ This file is NOT in valid GGUF format")
    else:
        raise FileNotFoundError(f"File not found: {model_path}")
except Exception as e:
    print(f"❌ Error: {e}")
    raise
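# Optional deeper header check (an added sketch, not part of the original app;
# assumes the GGUF v2+ header layout: 4-byte magic "GGUF", little-endian uint32
# version, then uint64 tensor count and uint64 metadata key/value count).
import struct
with open(model_path, "rb") as f:
    if f.read(4) == b"GGUF":
        gguf_version, = struct.unpack("<I", f.read(4))
        n_tensors, n_kv = struct.unpack("<QQ", f.read(16))
        print(f"πŸ”Ž GGUF v{gguf_version}: {n_tensors} tensors, {n_kv} metadata keys")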
# Load model into llama.cpp
print("\nπŸ”§ Loading model into llama.cpp...")
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,        # Context window (tokens)
        n_threads=4,       # CPU threads
        n_gpu_layers=0,    # CPU-only inference
        verbose=True       # Enable verbose logging for debugging
    )
    print("βœ… Model loaded successfully!")
except Exception as e:
    print(f"❌ Load failed: {e}")
    print(f"πŸ“‚ File: {model_path}")
    print(f"πŸ“Š Size: {os.path.getsize(model_path) / (1024**3):.2f} GB")
    print("\nπŸ’‘ File downloaded but cannot be loaded by llama.cpp")
    print("   β†’ The file is NOT in valid GGUF format")
    print("\n🎯 Solution: use unsloth in Colab to regenerate a correct GGUF file")
    raise
def chat(message, history, temperature, top_p, max_tokens):
    """
    Generate a response for the current message.

    Args:
        message: Current user input
        history: Conversation history [{'role': 'user', 'content': '...'}, ...]
        temperature: Sampling temperature (controls randomness)
        top_p: Nucleus sampling parameter
        max_tokens: Maximum number of tokens to generate
    """
    # Build the prompt in the Llama 3 chat template format
    prompt = "<|begin_of_text|>"

    # Add conversation history
    for msg in history:
        role = msg['role']
        content = msg['content']
        prompt += f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"

    # Add the current user message and open the assistant turn
    prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>"
    prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"

    # Generate the response
    try:
        output = llm(
            prompt,
            max_tokens=int(max_tokens),
            temperature=temperature,
            top_p=top_p,
            echo=False,
            stop=["<|eot_id|>", "<|end_of_text|>"]  # Stop tokens
        )
        response = output['choices'][0]['text'].strip()
        return response
    except Exception as e:
        return f"❌ Generation error: {str(e)}"
def chat_wrapper(message, history, temperature, top_p, max_tokens):
    """
    Generator wrapper around chat() for the Gradio Blocks interaction.
    Yields (textbox value, updated history) pairs.
    """
    if not message:
        # This function is a generator, so yield (not return) the unchanged state
        yield "", history
        return

    # Add the user message to the history and display it immediately
    new_history = history + [{"role": "user", "content": message}]
    yield "", new_history

    # Generate the assistant response
    response = chat(message, history, temperature, top_p, max_tokens)

    # Add the assistant response to the history
    new_history.append({"role": "assistant", "content": response})
    yield "", new_history
# Create Gradio interface using Blocks for full control
with gr.Blocks(theme=gr.themes.Soft(), title="Llama 3.2 3B Chatbot") as demo:
    gr.Markdown("""
# πŸ¦™ Llama 3.2 3B Fine-tuned Chatbot
Llama 3.2 3B model fine-tuned on the **FineTome-100k** dataset

- πŸ’Ύ **Model**: Llama 3.2 3B Instruct + LoRA
- πŸ“Š **Data**: 100k high-quality conversations
- βš™οΈ **Quantization**: Q4_K_M (CPU optimized)
- πŸ”— **GitHub**: [ID2223_lab2](https://github.com/Jiananliu12138/ID2223_lab2)
""")

    chatbot = gr.Chatbot(
        height=500,
        show_label=False,
        avatar_images=(None, "πŸ¦™"),
        type="messages"
    )

    with gr.Row():
        msg = gr.Textbox(
            placeholder="Enter your question...",
            container=False,
            scale=7
        )
        submit_btn = gr.Button("πŸ“€ Send", scale=1)

    with gr.Accordion(label="βš™οΈ Advanced Settings", open=False):
        temperature = gr.Slider(
            minimum=0,
            maximum=2,
            value=0.7,
            step=0.1,
            label="🌑️ Temperature",
            info="Controls randomness: low = deterministic, high = creative"
        )
        top_p = gr.Slider(
            minimum=0,
            maximum=1,
            value=0.9,
            step=0.05,
            label="🎯 Top P",
            info="Nucleus sampling: only consider the top-P probability mass"
        )
        max_tokens = gr.Slider(
            minimum=64,
            maximum=512,
            value=256,
            step=64,
            label="πŸ“ Max Tokens",
            info="Maximum generation length"
        )

    # Bind the click event ONLY to the button (Enter-key submission is disabled)
    submit_btn.click(
        fn=chat_wrapper,
        inputs=[msg, chatbot, temperature, top_p, max_tokens],
        outputs=[msg, chatbot],
        concurrency_limit=1
    )

    gr.Markdown("""
---
### πŸ“Š Training Details
- **Base Model**: unsloth/Llama-3.2-3B-Instruct
- **Fine-tuning Method**: LoRA (r=16, alpha=16)
- **Training Data**: FineTome-100k
- **Training Steps**: 1 full epoch (~12,500 steps)
- **Checkpoints**: Saved every 500 steps

### πŸ’‘ Usage Tips
- **Temperature 0.3-0.7**: Best for factual Q&A
- **Temperature 0.7-1.2**: Best for creative writing
- **Max Tokens 128**: Best for short answers
- **Max Tokens 256-512**: Best for detailed explanations

### 🎯 Project Features
βœ… Complete checkpoint mechanism (supports resuming training)
βœ… Saved to Google Drive (persistent storage)
βœ… Converted to GGUF format (optimized for CPU inference)
βœ… Deployed on HuggingFace Spaces (free access)

**Made for the ID2223 Scalable Machine Learning course**
""")
# Launch the application
if __name__ == "__main__":
    demo.queue()  # Enable queue for multi-user support
    demo.launch(
        share=False,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860
    )