| """ | |
| Llama 3.2 3B Fine-tuned Chatbot | |
| Fine-tuned conversational model based on FineTome-100k | |
| Deployed on HuggingFace Spaces | |
| """ | |
| import gradio as gr | |
| from llama_cpp import Llama | |
| from huggingface_hub import hf_hub_download | |
| import os | |
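# Note: the imports above correspond to the pip packages gradio, llama-cpp-python,
# and huggingface_hub. On Spaces these would typically be listed in requirements.txt
# (exact pins are an assumption, not taken from this repo).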
| print("π Starting Llama 3.2 3B Chatbot...") | |
| print(f"π Working directory: {os.getcwd()}") | |
| # Download GGUF model (using absolute path) | |
| print("π₯ Downloading model...") | |
| try: | |
| # Use absolute path in HuggingFace Spaces | |
| model_dir = "/app/models" | |
| os.makedirs(model_dir, exist_ok=True) | |
| print(f"π Model directory: {model_dir}") | |
| model_path = hf_hub_download( | |
| repo_id="handsomeLiu/ID2223-llama-3.2-3b-finetune-lora_model_new", | |
| filename="llama-3.2-3b-finetuned-Q4_K_M.gguf", | |
| local_dir=model_dir, # Absolute path | |
| ) | |
| print(model_path) | |
| print(f"β Model downloaded: {model_path}") | |
| # Verify file | |
| if os.path.exists(model_path): | |
| file_size = os.path.getsize(model_path) / (1024**3) | |
| print(f"π File size: {file_size:.2f} GB") | |
| # Check GGUF header | |
| with open(model_path, 'rb') as f: | |
| header = f.read(4) | |
| print(f"π File header: {header}") | |
| if header == b'GGUF': | |
| print("β Valid GGUF header detected") | |
| else: | |
| print(f"β INVALID GGUF! Expected b'GGUF', got {header}") | |
| print("π‘ This file is NOT valid GGUF format") | |
| else: | |
| raise FileNotFoundError(f"File not found: {model_path}") | |
| except Exception as e: | |
| print(f"β Error: {e}") | |
| raise | |
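# The 4-byte magic checked above is the standard GGUF file signature (ASCII "GGUF").
# If the check fails, a likely cause (an assumption, not confirmed from this repo's
# history) is that the repository holds a Git LFS pointer or a non-GGUF export
# instead of the converted model file.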
# Load the model into llama.cpp
print("\nLoading model into llama.cpp...")
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=0,
        verbose=True  # Enable verbose logging for debugging
    )
    print("Model loaded successfully!")
except Exception as e:
    print(f"Load failed: {e}")
    print(f"File: {model_path}")
    print(f"Size: {os.path.getsize(model_path) / (1024**3):.2f} GB")
    print("\nThe file was downloaded but cannot be loaded by llama.cpp,")
    print("which means it is not in valid GGUF format.")
    print("\nSolution: use unsloth in Colab to generate a correct GGUF file")
    raise
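# For reference, a minimal sketch of how the GGUF mentioned in the error message
# could be produced with unsloth in Colab (assumed workflow, not taken from this
# repo; the adapter path is a placeholder):
#
#   from unsloth import FastLanguageModel
#   model, tokenizer = FastLanguageModel.from_pretrained("path/to/lora_model")
#   model.save_pretrained_gguf("gguf_out", tokenizer, quantization_method="q4_k_m")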
def chat(message, history, temperature, top_p, max_tokens):
    """
    Chat function for generating responses

    Args:
        message: Current user input
        history: Conversation history [{'role': 'user', 'content': '...'}, ...]
        temperature: Temperature parameter (controls randomness)
        top_p: Nucleus sampling parameter
        max_tokens: Maximum number of tokens to generate
    """
    # Build the prompt in the Llama 3 chat template format
    prompt = "<|begin_of_text|>"

    # Add conversation history
    for msg in history:
        role = msg['role']
        content = msg['content']
        prompt += f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"

    # Add the current user message and open the assistant turn
    prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>"
    prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"

    # Generate the response
    try:
        output = llm(
            prompt,
            max_tokens=int(max_tokens),
            temperature=temperature,
            top_p=top_p,
            echo=False,
            stop=["<|eot_id|>", "<|end_of_text|>"]  # Stop tokens
        )
        response = output['choices'][0]['text'].strip()
        return response
    except Exception as e:
        return f"Generation error: {str(e)}"
def chat_wrapper(message, history, temperature, top_p, max_tokens):
    """
    Wrapper around chat() to handle the Gradio Blocks interaction
    """
    if not message:
        yield "", history
        return

    # Add the user message to the history and show it immediately
    new_history = history + [{"role": "user", "content": message}]
    yield "", new_history

    # Generate the response
    response = chat(message, history, temperature, top_p, max_tokens)

    # Add the assistant response to the history
    new_history.append({"role": "assistant", "content": response})
    yield "", new_history
# Create the Gradio interface using Blocks for full control
with gr.Blocks(theme=gr.themes.Soft(), title="Llama 3.2 3B Chatbot") as demo:
    gr.Markdown("""
    # 🦙 Llama 3.2 3B Fine-tuned Chatbot

    Llama 3.2 3B model fine-tuned on the **FineTome-100k** dataset

    - **Model**: Llama 3.2 3B Instruct + LoRA
    - **Data**: 100k high-quality conversations
    - **Quantization**: Q4_K_M (CPU optimized)
    - **GitHub**: [ID2223_lab2](https://github.com/Jiananliu12138/ID2223_lab2)
    """)

    chatbot = gr.Chatbot(
        height=500,
        show_label=False,
        avatar_images=(None, "🦙"),
        type="messages"
    )
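    # type="messages" makes the Chatbot hold its history as a list of
    # {"role": ..., "content": ...} dicts, the same structure chat_wrapper builds above.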
    with gr.Row():
        msg = gr.Textbox(
            placeholder="Enter your question...",
            container=False,
            scale=7
        )
        submit_btn = gr.Button("Send", scale=1)

    with gr.Accordion(label="Advanced Settings", open=False):
        temperature = gr.Slider(
            minimum=0,
            maximum=2,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Controls randomness: low = deterministic, high = creative"
        )
        top_p = gr.Slider(
            minimum=0,
            maximum=1,
            value=0.9,
            step=0.05,
            label="Top P",
            info="Nucleus sampling: only consider the top-P probability tokens"
        )
        max_tokens = gr.Slider(
            minimum=64,
            maximum=512,
            value=256,
            step=64,
            label="Max Tokens",
            info="Maximum generation length"
        )

    # Bind the click event ONLY to the button (disable Enter-key submission)
    submit_btn.click(
        fn=chat_wrapper,
        inputs=[msg, chatbot, temperature, top_p, max_tokens],
        outputs=[msg, chatbot],
        concurrency_limit=1
    )
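    # If Enter-key submission were wanted, Gradio's msg.submit() could be bound to
    # the same handler (left out here on purpose, per the comment above):
    #
    #   msg.submit(fn=chat_wrapper,
    #              inputs=[msg, chatbot, temperature, top_p, max_tokens],
    #              outputs=[msg, chatbot], concurrency_limit=1)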
    gr.Markdown("""
    ---
    ### Training Details
    - **Base Model**: unsloth/Llama-3.2-3B-Instruct
    - **Fine-tuning Method**: LoRA (r=16, alpha=16)
    - **Training Data**: FineTome-100k
    - **Training Steps**: 1 full epoch (~12,500 steps)
    - **Checkpoints**: Saved every 500 steps

    ### Usage Tips
    - **Temperature 0.3-0.7**: Best for factual Q&A
    - **Temperature 0.7-1.2**: Best for creative writing
    - **Max Tokens 128**: Best for short answers
    - **Max Tokens 256-512**: Best for detailed explanations

    ### Project Features
    - Complete checkpoint mechanism (supports resuming training)
    - Checkpoints saved to Google Drive (persistent storage)
    - Converted to GGUF format (optimized for CPU inference)
    - Deployed on HuggingFace Spaces (free access)

    **Made for the ID2223 Scalable Machine Learning course**
    """)
# Launch the application
if __name__ == "__main__":
    demo.queue()  # Enable the queue for multi-user support
    demo.launch(
        share=False,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860
    )