= committed on
Commit
d40b9df
·
0 Parent(s):

Initial commit

Browse files
Files changed (8) hide show
  1. .gitignore +50 -0
  2. README.md +55 -0
  3. agent_functions.py +208 -0
  4. app.py +167 -0
  5. dataset_handler.py +249 -0
  6. llm_agent.py +221 -0
  7. llm_client.py +73 -0
  8. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ venv/
9
+ ENV/
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ .eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ wheels/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # Data files
27
+ *.json
28
+ students.json
29
+ exams.json
30
+ topics.json
31
+
32
+ # IDE
33
+ .vscode/
34
+ .idea/
35
+ *.swp
36
+ *.swo
37
+ *~
38
+
39
+ # OS
40
+ .DS_Store
41
+ Thumbs.db
42
+
43
+ # Environment variables
44
+ .env
45
+ .env.local
46
+
47
+ # API keys (if stored)
48
+ *.key
49
+ api_key.txt
50
+
README.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## Що було зроблено
3
+
4
+ Цей проєкт реалізує LLM-агента з набором інструментів для роботи з датасетом сільськогосподарських досліджень. Ось що було реалізовано:
5
+
6
+ ### Основні компоненти:
7
+
8
+ 1. **LLM-агент з функціональним викликом (Function Calling)**
9
+ - Реалізовано клас `AgriculturalAgent` в `llm_agent.py`, який використовує LiteLLM для взаємодії з Groq API
10
+ - Агент підтримує багатоітераційну обробку запитів з автоматичним викликом функцій
11
+ - Реалізовано систему управління історією розмови для контекстної обробки запитів
12
+
13
+ 2. **Набір інструментів (Tools/Functions)**
14
+ - `search_agricultural_documents` - пошук документів за ключовими словами
15
+ - `get_document_details` - отримання детальної інформації про конкретний документ
16
+ - `browse_topics` - перегляд документів за темами або випадковий вибір
17
+ - `get_dataset_info` - отримання інформації про датасет
18
+ - Всі функції реалізовані в `agent_functions.py` з описом параметрів для LLM
19
+
20
+ 3. **Обробка датасету**
21
+ - Реалізовано `DatasetHandler` в `dataset_handler.py` для роботи з датасетом CGIAR
22
+ - Підтримка streaming-режиму для ефективної роботи з великим датасетом (45,232 документів)
23
+ - Реалізовано пошук за ключовими словами в заголовках, анотаціях та ключових словах
24
+ - Обробка помилок та обмеження часу виконання пошуку
25
+
26
+ 4. **Веб-інтерфейс**
27
+ - Створено інтерфейс на базі Gradio (`app.py`)
28
+ - Поле для введення API ключа (без зберігання в коді)
29
+ - Чат-інтерфейс для взаємодії з агентом
30
+ - Інформаційні блоки з описом можливостей та прикладами використання
31
+
32
+ 5. **Інтеграція з LLM**
33
+ - Використання LiteLLM для уніфікованої роботи з різними LLM провайдерами
34
+ - Налаштування системних промптів для спеціалізації агента на сільськогосподарській тематиці
35
+ - Автоматичне визначення необхідності виклику функцій на основі запиту користувача
36
+
37
+ ### Технічні особливості:
38
+
39
+ - **Streaming режим**: Датасет завантажується в режимі streaming для швидшого старту та економії пам'яті
40
+ - **Обробка помилок**: Реалізовано обробку помилок на всіх рівнях (завантаження датасету, пошук, виклики LLM)
41
+ - **Обмеження пошуку**: Для streaming-режиму встановлено обмеження на кількість перевірених документів (300) для швидкої відповіді
42
+ - **Багатоітераційна обробка**: Агент може виконувати кілька викликів функцій підряд для повної відповіді на запит
43
+ - **Захист від циклів**: Реалізовано захист від нескінченних циклів при однакових викликах функцій
44
+
45
+ ### Відмінності від запропонованого варіанту:
46
+
47
+ Замість агента для прийняття іспиту з курсу NLP, було обрано реалізацію агента для роботи з датасетом сільськогосподарських досліджень, що дозволяє продемонструвати:
48
+ - Роботу з реальним великим датасетом
49
+ - Пошук та фільтрацію інформації
50
+ - Структуровану роботу з документами
51
+ - Практичне застосування LLM-агентів для інформаційного пошуку
52
+
53
+ ## Acknowledgments
54
+
55
+ - Dataset: [CGIAR/gardian-ai-ready-docs](https://huggingface.co/datasets/CGIAR/gardian-ai-ready-docs)
agent_functions.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataset_handler import DatasetHandler
2
+ from typing import List, Dict, Optional
3
+
4
+ dataset_handler = DatasetHandler(use_streaming=True)
5
+
6
+
7
def search_agricultural_documents(keyword: str, limit: int = 5) -> str:
    """
    Search for agricultural research documents by keyword.

    Matches the keyword against each document's title, abstract, and keyword
    list in the CGIAR dataset. Intended for questions about specific crops,
    techniques, or agricultural concepts.

    Args:
        keyword: The search keyword (e.g., "rice", "pest control", "climate adaptation")
        limit: Maximum number of documents to return (default: 5)

    Returns:
        A formatted string containing information about matching documents
    """
    try:
        print(f"[FUNCTION] Searching for '{keyword}' (limit: {limit})...")
        matches = dataset_handler.search_by_keyword(keyword, limit)

        if not matches:
            return f"No documents found matching '{keyword}' after searching the dataset. The search may have been limited due to network timeouts. Try a different search term or a more specific keyword."

        summaries = [
            f"{idx}. {dataset_handler.format_document_summary(doc)}\n\n"
            for idx, doc in enumerate(matches, 1)
        ]
        return f"Found {len(matches)} document(s) matching '{keyword}':\n\n" + "".join(summaries)
    except Exception as e:
        error_msg = str(e)
        # Distinguish network stalls from genuine failures so the LLM can advise retrying.
        if "timeout" in error_msg.lower() or "timed out" in error_msg.lower():
            return "Search timed out while accessing the dataset. This can happen when the dataset is under heavy load. Please try again in a moment or use a more specific search term."
        return f"Error searching documents: {error_msg}"
39
+
40
+
41
def get_document_details(title: str) -> str:
    """
    Get detailed information about a specific document by its title.

    Use this when the user wants a deep dive on a paper surfaced by a previous
    search result — it adds chapter and figure information on top of the
    standard document summary.

    Args:
        title: The exact title of the document

    Returns:
        Detailed information about the document including chapters and figures
    """
    try:
        doc = dataset_handler.get_document_by_title(title)
        if not doc:
            return f"Document with title '{title}' not found. Please check the title and try again."

        parts = ["**Document Details:**\n\n", dataset_handler.format_document_summary(doc)]

        chapters = doc.get('chapters')
        if chapters:
            parts.append(f"\n**Chapters:** {len(chapters)} chapters found\n")
            # Only the first five chapter headings are listed to keep replies short.
            for idx, chapter in enumerate(chapters[:5], 1):
                parts.append(f"  {idx}. {chapter.get('head', 'Untitled')}\n")

        figures = doc.get('figures')
        if figures:
            parts.append(f"\n**Figures/Tables:** {len(figures)} found\n")

        return "".join(parts)
    except Exception as e:
        return f"Error retrieving document: {str(e)}"
76
+
77
+
78
def browse_topics(topic: Optional[str] = None) -> str:
    """
    Browse agricultural research documents by topic.

    Common topics include: crop management, pest control, climate adaptation,
    farming systems, soil management, water management, sustainable agriculture,
    small-scale farming, agricultural extension, food security.

    Args:
        topic: Optional specific topic to browse. If None, returns random documents.

    Returns:
        Information about documents related to the topic
    """
    # Fixes: `topic: str = None` was an implicit-Optional annotation, and the
    # result-formatting loop was duplicated in both branches.
    try:
        if topic:
            results = dataset_handler.search_by_topic(topic, limit=5)
            if not results:
                return f"No documents found for topic '{topic}'. Try a different topic."
            response = f"Documents related to '{topic}':\n\n"
        else:
            results = dataset_handler.get_random_documents(limit=3)
            response = "Sample agricultural research documents:\n\n"

        # Single formatting pass shared by both branches.
        for i, doc in enumerate(results, 1):
            response += f"{i}. {dataset_handler.format_document_summary(doc)}\n\n"

        return response
    except Exception as e:
        return f"Error browsing topics: {str(e)}"
110
+
111
+
112
def get_dataset_info() -> str:
    """
    Get information about the dataset.

    Returns:
        Information about the CGIAR dataset
    """
    try:
        # Lazy-load on first use so importing this module stays cheap.
        if not dataset_handler.loaded:
            dataset_handler.load_dataset()

        # A streaming dataset has no len(); fall back to the known approximate count.
        if dataset_handler.use_streaming:
            total_docs = "45,232+ (streaming mode)"
        else:
            total_docs = f"{len(dataset_handler.dataset):,}"

        # NOTE(review): the closing "streaming mode" note below is emitted even
        # when use_streaming is False — confirm this is intended.
        return f"""**CGIAR Agricultural Research Dataset**

This dataset contains {total_docs} agricultural research publications from CGIAR,
specifically processed for AI applications in agricultural advisory services.

**Dataset Features:**
- Comprehensive collection of agricultural research papers
- Topics include: crop management, pest control, climate adaptation, farming systems,
soil management, water management, sustainable agriculture, and more
- Documents are structured with metadata, abstracts, keywords, chapters, and figures
- Focus on small-scale producer contexts in low and middle-income countries

**Source:** GARDIAN (CGIAR's agri-food data hub)
**License:** CC-BY-4.0

**Note:** Dataset is loaded in streaming mode for faster access.
"""
    except Exception as e:
        return f"Error getting dataset info: {str(e)}"
147
+
148
+
149
# List of available functions for the LLM agent.
# Registry consumed by AgriculturalAgent.get_tools_schema(): maps each tool name
# exposed to the LLM onto the local callable plus its JSON-schema parameter
# description (OpenAI function-calling format).
AVAILABLE_FUNCTIONS = {
    "search_agricultural_documents": {
        "function": search_agricultural_documents,
        "description": "Search for agricultural research documents by keyword. Use when user asks about specific topics, crops, or agricultural concepts.",
        "parameters": {
            "type": "object",
            "properties": {
                "keyword": {
                    "type": "string",
                    "description": "The search keyword (e.g., 'rice', 'pest control', 'climate adaptation')"
                },
                "limit": {
                    "type": "integer",
                    "description": "Maximum number of documents to return (default: 5)",
                    "default": 5
                }
            },
            "required": ["keyword"]
        }
    },
    "get_document_details": {
        "function": get_document_details,
        "description": "Get detailed information about a specific document by its exact title. Use when user asks for more details about a specific paper.",
        "parameters": {
            "type": "object",
            "properties": {
                "title": {
                    "type": "string",
                    "description": "The exact title of the document"
                }
            },
            "required": ["title"]
        }
    },
    "browse_topics": {
        "function": browse_topics,
        "description": "Browse documents by agricultural topic or get random sample documents. Common topics: crop management, pest control, climate adaptation, farming systems, etc.",
        "parameters": {
            "type": "object",
            "properties": {
                "topic": {
                    "type": "string",
                    "description": "Optional specific topic to browse. If not provided, returns random documents."
                }
            },
            # "topic" is deliberately optional: the LLM may call this tool with no arguments.
            "required": []
        }
    },
    "get_dataset_info": {
        "function": get_dataset_info,
        "description": "Get information about the CGIAR dataset itself. Use when user asks about the dataset, its size, or what it contains.",
        "parameters": {
            "type": "object",
            "properties": {},
            "required": []
        }
    }
}
208
+
app.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from llm_agent import AgriculturalAgent
3
+ import os
4
+
5
+ agent = None
6
+
7
+
8
def initialize_agent(api_key: str) -> str:
    """
    Create the global AgriculturalAgent from a user-supplied Groq API key.

    Args:
        api_key: Groq API key

    Returns:
        Status message
    """
    global agent

    key = (api_key or "").strip()
    if not key:
        return "Please enter a valid API key."

    try:
        agent = AgriculturalAgent(api_key=key)
    except Exception as exc:
        return f"Error initializing agent: {str(exc)}"
    return "Agent initialized successfully! Dataset loaded and ready. You can now start asking questions."
27
+
28
+
29
def chat_with_agent(message: str, history: list, api_key: str) -> tuple:
    """
    Route one user message through the (lazily created) global agent.

    Args:
        message: User's message
        history: Chat history (Gradio format: list of tuples [(user_msg, assistant_msg), ...]);
            mutated in place
        api_key: Groq API key, used to auto-initialize the agent when needed

    Returns:
        Tuple of (empty string, updated history in Gradio format) — the empty
        string clears the input textbox
    """
    global agent

    print(f"\n[APP] Received message: {message[:100]}...")

    def _reply(text: str) -> tuple:
        # Record the exchange and clear the input box.
        history.append((message, text))
        return "", history

    if agent is None:
        if not api_key or not api_key.strip():
            return _reply("Please initialize the agent with an API key first.")
        print("[APP] Initializing agent...")
        init_msg = initialize_agent(api_key)
        if "Error" in init_msg:
            return _reply(init_msg)

    if agent is None:
        return _reply("Please initialize the agent with an API key first.")

    print("[APP] Sending message to agent...")

    response = agent.chat(message)
    print(f"[APP] Received response from agent: {response[:100]}...")

    return _reply(response)
67
+
68
+
69
def reset_chat():
    """Clear the Gradio chat log and the agent's stored conversation history.

    Returns:
        An empty list, which Gradio uses to blank the Chatbot component.
    """
    global agent
    if agent is not None:
        agent.reset_conversation()
    return []
75
+
76
+
77
# --- Gradio UI definition (module-level; the layout is built at import time) ---
with gr.Blocks(title="Agricultural Research AI Agent") as demo:
    # Static header: description, feature list, and usage instructions.
    gr.Markdown("""
# 🌾 Agricultural Research AI Agent

This AI agent helps you explore and find information from a comprehensive collection of
**45,232 agricultural research publications** from CGIAR (Consultative Group on International Agricultural Research).

## Features
- 🔍 Search for research documents on specific agricultural topics
- 📚 Browse documents by topic (crop management, pest control, climate adaptation, etc.)
- 📄 Get detailed information about specific research papers
- 💡 Get insights and answers based on the latest agricultural research

## How to Use
1. Enter your Groq API key in the field below (get a free key at https://console.groq.com)
2. Click "Initialize Agent" to start
3. Ask questions about agricultural topics, search for documents, or browse research papers

**Example questions:**
- "What research is available on rice cultivation?"
- "Find documents about pest control in agriculture"
- "Tell me about climate adaptation strategies for small-scale farmers"
- "What does the dataset contain?"

---
""")

    with gr.Row():
        with gr.Column(scale=3):
            # API key entry — masked; the key lives only in this session.
            api_key_input = gr.Textbox(
                label="Groq API Key",
                placeholder="Enter your Groq API key here...",
                type="password",
                info="Your API key is not stored and is only used for this session. Get a free API key at https://console.groq.com"
            )
            init_btn = gr.Button("Initialize Agent", variant="primary")
            init_status = gr.Textbox(label="Status", interactive=False)

        with gr.Column(scale=1):
            reset_btn = gr.Button("Reset Chat", variant="secondary")

    # Chat transcript display (legacy tuple-format history).
    chatbot = gr.Chatbot(
        label="Chat with Agricultural Research Agent",
        height=500
    )

    with gr.Row():
        msg = gr.Textbox(
            label="Your Question",
            placeholder="Ask about agricultural research, search for documents, or browse topics...",
            scale=4,
            lines=2
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    # Event wiring: explicit Initialize button.
    init_btn.click(
        fn=initialize_agent,
        inputs=[api_key_input],
        outputs=[init_status]
    )

    # Both the Send button and pressing Enter in the textbox submit a message;
    # chat_with_agent returns ("", history) so the textbox is cleared.
    submit_btn.click(
        fn=chat_with_agent,
        inputs=[msg, chatbot, api_key_input],
        outputs=[msg, chatbot]
    )

    msg.submit(
        fn=chat_with_agent,
        inputs=[msg, chatbot, api_key_input],
        outputs=[msg, chatbot]
    )

    reset_btn.click(
        fn=reset_chat,
        outputs=[chatbot]
    )

    # Footer: dataset provenance and API note.
    gr.Markdown("""
---
**Dataset Information:**
- Source: [CGIAR/gardian-ai-ready-docs](https://huggingface.co/datasets/CGIAR/gardian-ai-ready-docs) on HuggingFace
- Contains 45,232 structured agricultural research publications
- Topics: Crop management, pest control, climate adaptation, farming systems, and more

**Note:** This application uses Groq's free API for the LLM agent. Get your free API key at [console.groq.com](https://console.groq.com)
""")

if __name__ == "__main__":
    # NOTE(review): `theme=` is an argument of the gr.Blocks(...) constructor,
    # not of launch() — confirm this call succeeds on the pinned Gradio version.
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860, theme=gr.themes.Soft())
167
+
dataset_handler.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from typing import List, Dict, Optional
3
+ import json
4
+
5
+
6
class DatasetHandler:
    """Handles loading and searching the CGIAR agricultural dataset."""

    # Cap on documents scanned per query in streaming mode, so a search returns
    # quickly instead of iterating the whole remote dataset.
    _STREAMING_SCAN_LIMIT = 300
    # Abort a scan after this many records in a row fail to process.
    _MAX_CONSECUTIVE_ERRORS = 3

    def __init__(self, use_streaming: bool = True, max_samples: Optional[int] = None):
        """
        Initialize dataset handler.

        Args:
            use_streaming: If True, use streaming mode (faster, doesn't download all files)
            max_samples: Maximum number of samples to load (None = all, for testing use smaller number)
        """
        self.dataset = None
        self.loaded = False
        self.use_streaming = use_streaming
        self.max_samples = max_samples

    def load_dataset(self):
        """Load the CGIAR dataset from HuggingFace.

        Idempotent: repeated calls reuse the already-loaded dataset.

        Returns:
            The loaded dataset (streaming iterable or indexable split).

        Raises:
            Exception: Propagates any failure from the underlying load.
        """
        if not self.loaded:
            try:
                print("Loading CGIAR dataset from HuggingFace (this may take a moment)...")

                if self.use_streaming:
                    self.dataset = load_dataset(
                        "CGIAR/gardian-ai-ready-docs",
                        split="train",
                        streaming=True
                    )
                    print("Dataset loaded in streaming mode (lazy loading - files downloaded on-demand only)")
                elif self.max_samples:
                    # Download only a slice of the split for testing.
                    self.dataset = load_dataset(
                        "CGIAR/gardian-ai-ready-docs",
                        split=f"train[:{self.max_samples}]"
                    )
                    print(f"Dataset loaded successfully! Loaded {len(self.dataset)} documents (sample)")
                else:
                    self.dataset = load_dataset("CGIAR/gardian-ai-ready-docs", split="train")
                    print(f"Dataset loaded successfully! Total documents: {len(self.dataset)}")

                self.loaded = True
            except Exception as e:
                print(f"Error loading dataset: {e}")
                raise
        return self.dataset

    @staticmethod
    def _summary_fields(doc: Dict) -> Dict:
        """Project a raw dataset record onto the summary fields used everywhere.

        Previously this dict literal was duplicated in three methods; records may
        carry explicit None values, so nested lookups are None-safe.
        """
        metadata = doc.get('metadata') or {}
        return {
            'title': doc.get('title', ''),
            'abstract': doc.get('abstract', ''),
            'keywords': doc.get('keywords', []),
            'url': metadata.get('url', ''),
            'source': metadata.get('source', ''),
            'pageCount': doc.get('pageCount', 0)
        }

    def search_by_keyword(self, keyword: str, limit: int = 5) -> List[Dict]:
        """
        Search documents by keyword in title, abstract, or keywords.

        Args:
            keyword: Search keyword
            limit: Maximum number of results to return

        Returns:
            List of matching documents (possibly partial if errors/limits hit)
        """
        if not self.loaded:
            self.load_dataset()

        keyword_lower = keyword.lower()
        results = []
        checked = 0
        max_to_check = self._STREAMING_SCAN_LIMIT if self.use_streaming else None
        consecutive_errors = 0

        try:
            for doc in self.dataset:
                try:
                    checked += 1
                    # Show progress every 100 documents.
                    if checked % 100 == 0:
                        print(f"[DATASET] Checked {checked} documents, found {len(results)} matches so far...")

                    if max_to_check and checked > max_to_check:
                        print(f"[DATASET] Reached search limit of {max_to_check} documents")
                        break

                    # None-safe extraction: records may hold explicit nulls, and a
                    # None value would otherwise crash .lower()/join and waste the
                    # consecutive-error budget on well-formed records.
                    title = (doc.get('title') or '').lower()
                    abstract = (doc.get('abstract') or '').lower()
                    keywords = ' '.join(doc.get('keywords') or []).lower()

                    if keyword_lower in title or keyword_lower in abstract or keyword_lower in keywords:
                        results.append(self._summary_fields(doc))

                    consecutive_errors = 0  # Reset on a successfully processed record

                    if len(results) >= limit:
                        break

                except Exception:
                    consecutive_errors += 1
                    if consecutive_errors >= self._MAX_CONSECUTIVE_ERRORS:
                        print(f"[DATASET] Too many consecutive errors ({consecutive_errors}), stopping search")
                        break
                    # Skip the bad record and continue scanning.
                    continue

        except Exception as e:
            print(f"[DATASET] Error during search: {e}")
            # Fall through and return whatever partial results were collected.

        if results:
            print(f"[DATASET] Found {len(results)} results after checking {checked} documents")
        else:
            print(f"[DATASET] No results found after checking {checked} documents")

        return results

    def search_by_topic(self, topic: str, limit: int = 5) -> List[Dict]:
        """
        Search documents by agricultural topic.

        Thin alias over keyword search — topics are matched as plain keywords.

        Args:
            topic: Agricultural topic (e.g., "crop management", "pest control")
            limit: Maximum number of results to return

        Returns:
            List of matching documents
        """
        return self.search_by_keyword(topic, limit)

    def get_document_by_title(self, title: str) -> Optional[Dict]:
        """
        Retrieve a specific document by its title (case-insensitive exact match).

        Args:
            title: Document title

        Returns:
            Document data (summary fields plus chapters/figures) or None if not found
        """
        if not self.loaded:
            self.load_dataset()

        title_lower = title.lower()
        checked = 0
        max_to_check = self._STREAMING_SCAN_LIMIT if self.use_streaming else None  # Very aggressive limit
        consecutive_errors = 0

        try:
            for doc in self.dataset:
                try:
                    checked += 1
                    if max_to_check and checked > max_to_check:
                        break

                    if (doc.get('title') or '').lower() == title_lower:
                        details = self._summary_fields(doc)
                        # Detail view additionally carries structural content.
                        details['chapters'] = doc.get('chapters', [])
                        details['figures'] = doc.get('figures', [])
                        return details
                except Exception:
                    consecutive_errors += 1
                    if consecutive_errors >= self._MAX_CONSECUTIVE_ERRORS:
                        break
                    continue
        except Exception as e:
            print(f"[DATASET] Error searching for document: {e}")

        return None

    def get_random_documents(self, limit: int = 3) -> List[Dict]:
        """
        Get documents from the dataset.

        In streaming mode this returns the FIRST `limit` documents of the
        stream (a remote stream cannot be sampled cheaply), not a true random
        sample; in non-streaming mode it is genuinely random.

        Args:
            limit: Number of documents to return

        Returns:
            List of documents
        """
        if not self.loaded:
            self.load_dataset()

        if self.use_streaming:
            results = []
            for doc in self.dataset:
                # Check before appending so limit<=0 correctly yields [].
                if len(results) >= limit:
                    break
                results.append(self._summary_fields(doc))
            return results

        import random  # Imported lazily, matching the original module style
        indices = random.sample(range(len(self.dataset)), min(limit, len(self.dataset)))
        return [self._summary_fields(self.dataset[idx]) for idx in indices]

    def format_document_summary(self, doc: Dict) -> str:
        """
        Format a document for display in the chat.

        Args:
            doc: Document dictionary

        Returns:
            Formatted string representation (Markdown; abstract truncated to 500 chars)
        """
        summary = f"**Title:** {doc.get('title', 'N/A')}\n"
        summary += f"**Abstract:** {doc.get('abstract', 'N/A')[:500]}...\n"
        if doc.get('keywords'):
            summary += f"**Keywords:** {', '.join(doc.get('keywords', []))}\n"
        summary += f"**Source:** {doc.get('source', 'N/A')}\n"
        if doc.get('url'):
            summary += f"**URL:** {doc.get('url')}\n"
        return summary
249
+
llm_agent.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import List, Dict, Optional
3
+ from litellm import completion
4
+ from agent_functions import AVAILABLE_FUNCTIONS, dataset_handler
5
+ from llm_client import LLMClient
6
+
7
+
8
class AgriculturalAgent:
    """LLM Agent for answering questions about agricultural research using the CGIAR dataset.

    Drives a multi-iteration tool-calling loop via LiteLLM: the model may
    request functions from AVAILABLE_FUNCTIONS, their results are fed back as
    tool messages, and the loop repeats until the model produces plain text
    (or a safety limit is hit).
    """

    def __init__(self, api_key: str, model: str = "groq/llama-3.3-70b-versatile"):
        """
        Initialize the agent.

        Args:
            api_key: Groq API key
            model: Model to use (default: groq/llama-3.3-70b-versatile)
        """
        self.llm_client = LLMClient(model=model, api_key=api_key)
        self.model = model
        # Full running transcript (user / assistant / tool messages) kept
        # across chat() calls for conversational context.
        self.conversation_history: List[Dict] = []

        # Eagerly load the shared dataset so the first query doesn't pay the cost.
        print("Initializing dataset...")
        dataset_handler.load_dataset()
        print("Dataset ready!")

    def get_tools_schema(self) -> List[Dict]:
        """Get the tools schema for function calling.

        Returns:
            AVAILABLE_FUNCTIONS converted into the OpenAI-style `tools` list
            accepted by litellm.completion.
        """
        tools = []
        for func_name, func_info in AVAILABLE_FUNCTIONS.items():
            tools.append({
                "type": "function",
                "function": {
                    "name": func_name,
                    "description": func_info["description"],
                    "parameters": func_info["parameters"]
                }
            })
        return tools

    def call_function(self, function_name: str, arguments: Dict) -> str:
        """
        Call a function by name with the provided arguments.

        Args:
            function_name: Name of the function to call
            arguments: Arguments to pass to the function

        Returns:
            Function result as string (errors are returned as strings, never raised,
            so the tool result can always be handed back to the model)
        """
        if function_name not in AVAILABLE_FUNCTIONS:
            return f"Error: Function '{function_name}' not found."

        func = AVAILABLE_FUNCTIONS[function_name]["function"]
        try:
            result = func(**arguments)
            return result
        except Exception as e:
            return f"Error calling function {function_name}: {str(e)}"

    def chat(self, user_message: str, system_message: Optional[str] = None) -> str:
        """
        Process a user message and return a response.

        Args:
            user_message: The user's message
            system_message: Optional system message to override default

        Returns:
            Agent's response (plain text; errors are returned as strings)
        """
        print(f"[AGENT] Processing user message: {user_message[:100]}...")

        if system_message is None:
            system_message = """You are an AI assistant specialized in agricultural research.
You have access to a comprehensive dataset of agricultural research publications from CGIAR.

Your role is to:
1. Help users find relevant agricultural research documents
2. Answer questions about agricultural topics using information from the dataset
3. Provide insights based on the research papers available

When a user asks a question:
- If they ask about a specific topic, crop, or agricultural concept, use the search_agricultural_documents function
- If they want to browse topics, use the browse_topics function
- If they ask about a specific document, use get_document_details
- If they ask about the dataset itself, use get_dataset_info

Always be helpful, accurate, and cite the sources when providing information from the dataset.
If you don't have enough information, suggest searching for more specific documents."""

        self.conversation_history.append({
            "role": "user",
            "content": user_message
        })

        # Rebuild the prompt: system message first, then the whole transcript.
        messages = [{"role": "system", "content": system_message}]
        messages.extend(self.conversation_history)

        tools = self.get_tools_schema()
        print(f"[AGENT] Calling LLM API with model: {self.model}")

        try:
            # Safety limit on tool-call round-trips per user message.
            max_iterations = 5
            iteration = 0
            last_tool_calls = []
            tool_messages = []

            while iteration < max_iterations:
                iteration += 1
                print(f"[AGENT] LLM API call iteration {iteration}...")

                response = completion(
                    model=self.model,
                    messages=messages,
                    tools=tools,
                    tool_choice="auto",
                    temperature=0.1,
                    max_tokens=2048
                )

                message = response.choices[0].message

                if hasattr(message, 'tool_calls') and message.tool_calls:
                    print(f"[AGENT] LLM requested {len(message.tool_calls)} tool call(s)")

                    # Normalize (name, raw-args) pairs; tool calls may be objects
                    # or plain dicts depending on the provider response shape.
                    current_tool_calls = [(tc.function.name if hasattr(tc, 'function') else tc.get('function', {}).get('name'),
                                           tc.function.arguments if hasattr(tc, 'function') else tc.get('function', {}).get('arguments', '{}'))
                                          for tc in message.tool_calls]

                    # Loop guard: identical consecutive tool calls mean the model
                    # is stuck repeating itself — bail out.
                    if iteration > 1 and current_tool_calls == last_tool_calls:
                        print(f"[AGENT] Warning: Same tool calls detected, breaking loop")
                        if tool_messages:
                            assistant_response = tool_messages[-1].get('content', '')[:500] + "..."
                        else:
                            assistant_response = "I encountered an issue processing your request. Please try rephrasing your question."
                        # NOTE(review): this assistant_response is overwritten after the
                        # loop by the `if tool_messages:` fallback (which returns the
                        # untruncated last tool result) — confirm that is intended.
                        break

                    last_tool_calls = current_tool_calls

                    # Record the assistant's tool-call turn in the transcript,
                    # normalized to plain dicts.
                    tool_calls_data = []
                    for tc in message.tool_calls:
                        tool_calls_data.append({
                            "id": tc.id if hasattr(tc, 'id') else str(hash(str(tc))),
                            "type": tc.type if hasattr(tc, 'type') else "function",
                            "function": {
                                "name": tc.function.name if hasattr(tc, 'function') else tc.get('function', {}).get('name'),
                                "arguments": tc.function.arguments if hasattr(tc, 'function') else tc.get('function', {}).get('arguments', '{}')
                            }
                        })

                    self.conversation_history.append({
                        "role": "assistant",
                        "content": message.content or "",
                        "tool_calls": tool_calls_data
                    })

                    # Execute every requested tool and collect the results.
                    tool_messages = []
                    for tc in message.tool_calls:
                        function_name = tc.function.name if hasattr(tc, 'function') else tc.get('function', {}).get('name')
                        try:
                            arguments_str = tc.function.arguments if hasattr(tc, 'function') else tc.get('function', {}).get('arguments', '{}')
                            # Treat absent/empty/'null' argument payloads as "no arguments".
                            if arguments_str is None or arguments_str == '' or arguments_str == 'null':
                                arguments = {}
                            else:
                                arguments = json.loads(arguments_str)
                        except (json.JSONDecodeError, AttributeError, TypeError) as e:
                            print(f"[AGENT] Warning: Failed to parse arguments: {e}, using empty dict")
                            arguments = {}

                        print(f"[AGENT] Calling function: {function_name} with args: {arguments}")
                        function_result = self.call_function(function_name, arguments)
                        print(f"[AGENT] Function {function_name} returned result (length: {len(function_result)} chars)")
                        tool_call_id = tc.id if hasattr(tc, 'id') else str(hash(str(tc)))

                        tool_messages.append({
                            "role": "tool",
                            "tool_call_id": tool_call_id,
                            "name": function_name,
                            "content": function_result
                        })

                    self.conversation_history.extend(tool_messages)

                    # Rebuild the prompt including the new tool results and iterate.
                    messages = [{"role": "system", "content": system_message}]
                    messages.extend(self.conversation_history)

                else:
                    # Plain-text answer: record it and return immediately.
                    assistant_response = message.content or "I apologize, but I couldn't generate a response."
                    print(f"[AGENT] Generated response (length: {len(assistant_response)} chars)")
                    self.conversation_history.append({
                        "role": "assistant",
                        "content": assistant_response
                    })
                    return assistant_response

            # Loop exhausted (or broken out of) without a plain-text answer:
            # fall back to the most recent tool output, if any.
            if tool_messages:
                print(f"[AGENT] Max iterations reached, returning last tool result")
                assistant_response = tool_messages[-1].get('content', '')
                self.conversation_history.append({
                    "role": "assistant",
                    "content": assistant_response
                })
                return assistant_response

            return "I apologize, but I encountered an issue processing your request. Please try again."

        except Exception as e:
            # Surface the error to the user and keep it in the transcript so the
            # conversation stays consistent.
            error_msg = f"Error in chat: {str(e)}"
            print(f"[AGENT] ERROR: {error_msg}")
            self.conversation_history.append({
                "role": "assistant",
                "content": error_msg
            })
            return error_msg

    def reset_conversation(self):
        """Reset the conversation history."""
        self.conversation_history = []
221
+
llm_client.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional
3
+ from litellm import completion
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
+
9
class LLMClient:
    """Thin wrapper around LiteLLM's ``completion`` call for Groq-hosted models."""

    def __init__(
        self,
        model: str = "groq/llama-3.3-70b-versatile",
        api_key: Optional[str] = None,
        temperature: float = 0.1
    ):
        """
        Initialize LLM client.

        Args:
            model: Model identifier (e.g., "groq/llama-3.3-70b-versatile")
            api_key: API key (if None, uses GROQ_API_KEY env var)
            temperature: Sampling temperature

        Raises:
            ValueError: If no api_key is given and GROQ_API_KEY is not set
                in the environment.
        """
        self.model = model
        self.temperature = temperature

        if api_key:
            # LiteLLM picks the key up from the environment, so export it there.
            os.environ["GROQ_API_KEY"] = api_key
        elif "GROQ_API_KEY" not in os.environ:
            # No point re-reading os.getenv here: it consults the same
            # os.environ mapping we just checked, so the key cannot appear.
            # Fail fast with an actionable message instead.
            raise ValueError(
                "GROQ_API_KEY not found. Please set it as environment variable "
                "or pass as api_key parameter. Get free key from https://console.groq.com/"
            )

    def generate(
        self,
        prompt: str,
        max_tokens: int = 512,
        system_prompt: Optional[str] = None
    ) -> str:
        """
        Generate text using the configured LLM.

        Args:
            prompt: User prompt
            max_tokens: Maximum tokens to generate
            system_prompt: Optional system prompt

        Returns:
            The first choice's message content from the completion response.

        Raises:
            RuntimeError: If the underlying LiteLLM call fails; the original
                exception is chained as ``__cause__`` for debugging.
        """
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        # Keep the try body minimal: only the network call can raise here.
        try:
            response = completion(
                model=self.model,
                messages=messages,
                temperature=self.temperature,
                max_tokens=max_tokens
            )
        except Exception as e:
            # Chain the root cause instead of discarding it (original code
            # raised a bare Exception with no ``from e``). RuntimeError is
            # still caught by callers handling Exception.
            raise RuntimeError(f"Error calling LLM: {str(e)}") from e

        # NOTE(review): content can be None for tool-call-only responses —
        # presumably callers here always expect plain text; verify upstream.
        return response.choices[0].message.content
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ litellm>=1.0.0
3
+ python-dotenv>=1.0.0
4
+ datasets>=2.14.0
5
+ huggingface-hub>=0.16.0
6
+