Multiple sources OK
climateqa/engine/chains/answer_rag.py (CHANGED)

```diff
@@ -11,7 +11,7 @@ import time
 from ..utils import rename_chain, pass_values
 
 
-DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
+DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="Source : {source} - {page_content}")
 
 def _combine_documents(
     docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
```
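For context on this change: each retrieved passage is now rendered with its source name, so the downstream answer prompt can attribute citations per report. A minimal standalone sketch of the effect, with plain dicts standing in for LangChain `Document` objects and hypothetical sample passages:

```python
# Sketch only: shows what the new DEFAULT_DOCUMENT_PROMPT produces per passage.
# Assumes langchain-core is installed; the docs below are made-up stand-ins.
from langchain_core.prompts import PromptTemplate

DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template("Source : {source} - {page_content}")

docs = [
    {"source": "IPCC", "page_content": "Warming of the climate system is unequivocal."},
    {"source": "PCAET", "page_content": "Paris vise la neutralité carbone en 2050."},
]

# Mirrors what _combine_documents does with sep="\n\n".
combined = "\n\n".join(DEFAULT_DOCUMENT_PROMPT.format(**doc) for doc in docs)
print(combined)
# Source : IPCC - Warming of the climate system is unequivocal.
#
# Source : PCAET - Paris vise la neutralité carbone en 2050.
```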
climateqa/engine/chains/prompts.py (CHANGED)

```diff
@@ -36,6 +36,30 @@ You are given a question and extracted passages of the IPCC and/or IPBES reports
 """
 
 
+# answer_prompt_template_old = """
+# You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
+
+# Guidelines:
+# - If the passages have useful facts or numbers, use them in your answer.
+# - When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
+# - Do not use the sentence 'Doc i says ...' to say where information came from.
+# - If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
+# - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
+# - If it makes sense, use bullet points and lists to make your answers easier to understand.
+# - You do not need to use every passage. Only use the ones that help answer the question.
+# - If the documents do not have the information needed to answer the question, just say you do not have enough information.
+# - Consider by default that the question is about the past century unless it is specified otherwise.
+# - If the passage is the caption of a picture, you can still use it as part of your answer as any other document.
+
+# -----------------------
+# Passages:
+# {context}
+
+# -----------------------
+# Question: {query} - Explained to {audience}
+# Answer in {language} with the passages citations:
+# """
+
 answer_prompt_template = """
 You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
 
@@ -50,6 +74,8 @@ Guidelines:
 - If the documents do not have the information needed to answer the question, just say you do not have enough information.
 - Consider by default that the question is about the past century unless it is specified otherwise.
 - If the passage is the caption of a picture, you can still use it as part of your answer as any other document.
+- If you receive passages from different reports, eg IPCC and PPCP, make separate paragraphs and specify the source of the information in your answer, eg "According to IPCC, ...".
+- The different sources are IPCC, IPBES, PPCP (for Plan Climat Air Energie Territorial de Paris), PBDP (for Plan Biodiversité de Paris), Acclimaterra.
 
 -----------------------
 Passages:
@@ -60,7 +86,6 @@ Question: {query} - Explained to {audience}
 Answer in {language} with the passages citations:
 """
 
-
 papers_prompt_template = """
 You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted abstracts of scientific papers. Provide a clear and structured answer based on the abstracts provided, the context and the guidelines.
 
```
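These templates are filled like any other LangChain prompt. A minimal sketch of the variable plumbing, with the template body shortened and the sample values hypothetical:

```python
# Sketch only: the real answer_prompt_template is much longer; the variable
# names ({context}, {query}, {audience}, {language}) match the template above.
from langchain_core.prompts import PromptTemplate

answer_prompt_template = """Passages:
{context}

-----------------------
Question: {query} - Explained to {audience}
Answer in {language} with the passages citations:"""

prompt = PromptTemplate.from_template(answer_prompt_template)
print(prompt.format(
    context="Source : IPCC - Warming of the climate system is unequivocal.",
    query="Is the climate warming?",
    audience="children",
    language="English",
))
```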
climateqa/engine/chains/query_transformation.py (CHANGED)

```diff
@@ -60,7 +60,8 @@ from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
 
 
 ROUTING_INDEX = {
-    "
+    "IPx":["IPCC", "IPBES", "IPOS"],
+    "POC": ["AcclimaTerra", "PCAET","Biodiv"],
     "OpenAlex":["OpenAlex"],
 }
 
@@ -88,6 +89,17 @@ class Location(BaseModel):
     country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, addresses), ex: France, USA, ...")
     location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
 
+class QueryTranslation(BaseModel):
+    """Translate the query into a given language"""
+
+    question : str = Field(
+        description="""
+        Translate the questions into the given language
+        If the question is already in the given language, just return the same question
+        """,
+    )
+
+
 class QueryAnalysis(BaseModel):
     """
     Analyze the user query to extract the relevant sources
@@ -98,14 +110,16 @@ class QueryAnalysis(BaseModel):
     Also provide simple keywords to feed a search engine
     """
 
-    sources: List[Literal["IPCC", "IPBES", "IPOS", "AcclimaTerra"]] = Field( #,"OpenAlex"]] = Field(
+    sources: List[Literal["IPCC", "IPBES", "IPOS", "AcclimaTerra", "PCAET","Biodiv"]] = Field( #,"OpenAlex"]] = Field(
         ...,
         description="""
        Given a user question choose which documents would be most relevant for answering their question,
        - IPCC is for questions about climate change, energy, impacts, and everything we can find in the IPCC reports
        - IPBES is for questions about biodiversity and nature
        - IPOS is for questions about the ocean and deep sea mining
-       - AcclimaTerra is for questions about any specific place in, or close to, the French region "Nouvelle-Aquitaine"
+       - AcclimaTerra is for questions about any specific place in, or close to, the French region "Nouvelle-Aquitaine"
+       - PCAET is the Plan Climat Energie Territorial for the city of Paris
+       - Biodiv is the Biodiversity plan for the city of Paris
        """,
     )
@@ -142,7 +156,25 @@ def make_query_analysis_chain(llm):
 
 
     prompt = ChatPromptTemplate.from_messages([
-        ("system", "You are a helpful assistant, you will analyze
+        ("system", "You are a helpful assistant, you will analyze the user input message using the function provided"),
+        ("user", "input: {input}")
+    ])
+
+
+    chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
+    return chain
+
+
+def make_query_translation_chain(llm):
+    """Translate the user query into the given language"""
+
+    openai_functions = [convert_to_openai_function(QueryTranslation)]
+    llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryTranslation"})
+
+
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", "You are a helpful assistant, translate the question into {language}"),
         ("user", "input: {input}")
     ])
 
```
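A hypothetical invocation of the new translation chain; it assumes the module path shown above, `langchain-openai` installed, and an OpenAI API key in the environment. The printed output is illustrative:

```python
# Hypothetical usage sketch, not code from the PR.
from langchain_openai import ChatOpenAI
from climateqa.engine.chains.query_transformation import make_query_translation_chain

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # any function-calling chat model
chain = make_query_translation_chain(llm)

result = chain.invoke({"input": "Comment le climat change-t-il ?", "language": "English"})
print(result)  # e.g. {'question': 'How is the climate changing?'}
```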
```diff
@@ -150,6 +182,16 @@ def make_query_analysis_chain(llm):
     chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
     return chain
 
+def group_by_sources_types(sources):
+    sources_types = {}
+    IPx_sources = ["IPCC", "IPBES", "IPOS"]
+    local_sources = ["AcclimaTerra", "PCAET","Biodiv"]
+    if any(source in IPx_sources for source in sources):
+        sources_types["IPx"] = list(set(sources).intersection(IPx_sources))
+    if any(source in local_sources for source in sources):
+        sources_types["POC"] = list(set(sources).intersection(local_sources))
+    return sources_types
+
 
 def make_query_transform_node(llm,k_final=15):
     """
```
```diff
@@ -172,12 +214,13 @@ def make_query_transform_node(llm,k_final=15):
 
     decomposition_chain = make_query_decomposition_chain(llm)
     query_analysis_chain = make_query_analysis_chain(llm)
+    query_translation_chain = make_query_translation_chain(llm)
 
     def transform_query(state):
         print("---- Transform query ----")
 
-        auto_mode = state.get("sources_auto",
-        sources_input = state.get("sources_input", ROUTING_INDEX["
+        auto_mode = state.get("sources_auto", True)
+        sources_input = state.get("sources_input", ROUTING_INDEX["IPx"])
 
 
         new_state = {}
@@ -186,6 +229,7 @@ def make_query_transform_node(llm,k_final=15):
         decomposition_output = decomposition_chain.invoke({"input":state["query"]})
         new_state.update(decomposition_output)
 
+
         # Query Analysis
         questions = []
         for question in new_state["questions"]:
@@ -194,16 +238,32 @@ def make_query_transform_node(llm,k_final=15):
 
             # TODO WARNING llm should always return something
             # The case when the llm does not return any sources or wrong output
-            if not query_analysis_output["sources"] or not all(source in ["IPCC", "IPBES", "IPOS"] for source in query_analysis_output["sources"]):
+            if not query_analysis_output["sources"] or not all(source in ["IPCC", "IPBES", "IPOS","AcclimaTerra", "PCAET","Biodiv"] for source in query_analysis_output["sources"]):
                 query_analysis_output["sources"] = ["IPCC", "IPBES", "IPOS"]
 
-
-
+            sources_types = group_by_sources_types(query_analysis_output["sources"])
+            for source_type,sources in sources_types.items():
+                question_state = {
+                    "question":question,
+                    "sources":sources,
+                    "source_type":source_type
+                }
+
+                questions.append(question_state)
+
+        # Translate question into the document language
+        for q in questions:
+            if q["source_type"]=="IPx":
+                translation_output = query_translation_chain.invoke({"input":q["question"],"language":"English"})
+                q["question"] = translation_output["question"]
+            elif q["source_type"]=="POC":
+                translation_output = query_translation_chain.invoke({"input":q["question"],"language":"French"})
+                q["question"] = translation_output["question"]
 
         # Explode the questions into multiple questions with different sources
         new_questions = []
         for q in questions:
-            question,sources = q["question"],q["sources"]
+            question,sources,source_type = q["question"],q["sources"], q["source_type"]
 
             # If not auto mode we take the configuration
             if not auto_mode:
@@ -212,7 +272,7 @@ def make_query_transform_node(llm,k_final=15):
             for index,index_sources in ROUTING_INDEX.items():
                 selected_sources = list(set(sources).intersection(index_sources))
                 if len(selected_sources) > 0:
-                    new_questions.append({"question":question,"sources":selected_sources,"index":index})
+                    new_questions.append({"question":question,"sources":selected_sources,"index":index, "source_type":source_type})
 
         # # Add the number of questions to search
         # k_by_question = k_final // len(new_questions)
@@ -222,11 +282,16 @@ def make_query_transform_node(llm,k_final=15):
         # new_state["questions"] = new_questions
         # new_state["remaining_questions"] = new_questions
 
+        n_questions = {
+            "total":len(new_questions),
+            "IPx":len([q for q in new_questions if q["index"] == "IPx"]),
+            "POC":len([q for q in new_questions if q["index"] == "POC"]),
+        }
 
         new_state = {
            "questions_list":new_questions,
-            "n_questions":
-            "handled_questions_index":[],
+            "n_questions":n_questions,
+            "handled_questions_index":[],
        }
        return new_state
 
```
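Putting the transform_query hunks together, the node now emits a state shaped roughly like the hand-written example below (not captured output; the query, sources and counts are hypothetical):

```python
# Illustrative shape of the state returned by transform_query in auto mode for
# the query "Comment Paris s'adapte-t-il au changement climatique ?".
example_new_state = {
    "questions_list": [
        # IPx questions are translated to English...
        {"question": "How is Paris adapting to climate change?",
         "sources": ["IPCC"], "index": "IPx", "source_type": "IPx"},
        # ...while POC (local) questions are translated to / kept in French.
        {"question": "Comment Paris s'adapte-t-il au changement climatique ?",
         "sources": ["PCAET"], "index": "POC", "source_type": "POC"},
    ],
    "n_questions": {"total": 2, "IPx": 1, "POC": 1},
    "handled_questions_index": [],  # filled in as retriever nodes consume questions
}
```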
climateqa/engine/chains/retrieve_documents.py (CHANGED)

```diff
@@ -290,7 +290,7 @@ async def retrieve_documents(state,config, source_type, vectorstore,reranker,llm
     Returns:
         dict: The updated state containing the retrieved and reranked documents, related content, and remaining questions.
     """
-
+    # TODO split the questions by source type in the question state + conditions on the number of questions handled per source type
     docs = state.get("documents", [])
     related_content = state.get("related_content", [])
 
@@ -304,26 +304,30 @@ async def retrieve_documents(state,config, source_type, vectorstore,reranker,llm
     # remaining_questions = state["remaining_questions"][1:]
 
     current_question_id = None
-    print("
+    print("Question indexes", list(range(len(state["questions_list"]))), "- Handled questions : ", state["handled_questions_index"])
 
     for i in range(len(state["questions_list"])):
-        if i not in state["handled_questions_index"]:
+        current_question = state["questions_list"][i]
+
+        if i not in state["handled_questions_index"] and current_question["source_type"] == source_type:
             current_question_id = i
             break
-
+
     # TODO filter on source_type
 
-    k_by_question = k_final // state["n_questions"]
-    k_summary_by_question = _get_k_summary_by_question(state["n_questions"])
-    k_images_by_question = _get_k_images_by_question(state["n_questions"])
+    k_by_question = k_final // state["n_questions"]["total"]
+    k_summary_by_question = _get_k_summary_by_question(state["n_questions"]["total"])
+    k_images_by_question = _get_k_images_by_question(state["n_questions"]["total"])
 
     sources = current_question["sources"]
     question = current_question["question"]
     index = current_question["index"]
+    source_type = current_question["source_type"]
 
     print(f"Retrieve documents for question: {question}")
     await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)
 
+    print(f"""---- Retrieve documents from {current_question["source_type"]}----""")
 
     # if index == "Vector": # always true for now #TODO rename to IPx
     if source_type == "IPx": # always true for now #TODO rename to IPx
@@ -393,7 +397,7 @@ def make_IPx_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_
     @chain
     async def retrieve_IPx_docs(state, config):
         source_type = "IPx"
-        return {"documents":[], "related_contents": [], "handled_questions_index": list(range(len(state["questions_list"])))} # TODO Remove
+        # return {"documents":[], "related_contents": [], "handled_questions_index": list(range(len(state["questions_list"])))} # TODO Remove
 
         state = await retrieve_documents(
             state = state,
```
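The selection loop now skips questions that belong to the other retriever, so the IPx and POC nodes can share one questions_list. A minimal standalone restatement of the filter with a toy state:

```python
# Toy restatement of the new question-selection filter from the hunk above.
state = {
    "questions_list": [
        {"question": "q1", "source_type": "IPx"},
        {"question": "q2", "source_type": "POC"},
    ],
    "handled_questions_index": [0],  # q1 was already handled
}
source_type = "POC"  # the retriever node asking for work

current_question_id = None
for i in range(len(state["questions_list"])):
    current_question = state["questions_list"][i]
    if i not in state["handled_questions_index"] and current_question["source_type"] == source_type:
        current_question_id = i
        break

print(current_question_id)  # 1 -> the POC retriever picks up q2
```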
climateqa/engine/graph.py (CHANGED)

```diff
@@ -93,21 +93,40 @@ def route_based_on_relevant_docs(state,threshold_docs=0.2):
         return "answer_rag_no_docs"
 
 def route_continue_retrieve_documents(state):
-    if len(state["remaining_questions"]) == 0 and state["search_only"] :
+    index_question_ipx = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "IPx"]
+    questions_ipx_finished = all(elem in state["handled_questions_index"] for elem in index_question_ipx)
+    if questions_ipx_finished and state["search_only"]:
         return END
-    elif len(state["remaining_questions"]) > 0:
-        return "answer_search"
-    else :
+    elif questions_ipx_finished:
+        return "answer_search"
+    else:
         return "retrieve_documents"
+
+
+    # if state["n_questions"]["IPx"] == len(state["handled_questions_index"]) and state["search_only"] :
+    #     return END
+    # elif state["n_questions"]["IPx"] == len(state["handled_questions_index"]):
+    #     return "answer_search"
+    # else :
+    #     return "retrieve_documents"
 
 def route_continue_retrieve_local_documents(state):
-    if len(state["remaining_questions"]) == 0 and state["search_only"] :
+    index_question_poc = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "POC"]
+    questions_poc_finished = all(elem in state["handled_questions_index"] for elem in index_question_poc)
+    if questions_poc_finished and state["search_only"]:
         return END
-    elif len(state["remaining_questions"]) > 0:
+    elif questions_poc_finished:
         return "answer_search"
-    else :
+    else:
         return "retrieve_local_data"
 
+    # if state["n_questions"]["POC"] == len(state["handled_questions_index"]) and state["search_only"] :
+    #     return END
+    # elif state["n_questions"]["POC"] == len(state["handled_questions_index"]):
+    #     return "answer_search"
+    # else :
+    #     return "retrieve_local_data"
+
     # if len(state["remaining_questions"]) == 0 and state["search_only"] :
     #     return END
     # elif len(state["remaining_questions"]) > 0:
```
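For intuition on the new routing: the graph keeps looping through retrieve_documents until every IPx question is in handled_questions_index, then moves on. A self-contained sketch with a toy state, where the string "__end__" stands in for langgraph's END sentinel:

```python
# Toy restatement of route_continue_retrieve_documents from the hunk above.
END = "__end__"  # stand-in for langgraph's END sentinel

def route_continue_retrieve_documents(state):
    index_question_ipx = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "IPx"]
    questions_ipx_finished = all(elem in state["handled_questions_index"] for elem in index_question_ipx)
    if questions_ipx_finished and state["search_only"]:
        return END
    elif questions_ipx_finished:
        return "answer_search"
    else:
        return "retrieve_documents"

state = {
    "questions_list": [{"source_type": "IPx"}, {"source_type": "POC"}],
    "handled_questions_index": [0],  # the only IPx question (index 0) is done
    "search_only": False,
}
print(route_continue_retrieve_documents(state))  # answer_search
```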
```diff
@@ -216,8 +235,8 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_regi
 
     # Define the edges
     workflow.add_edge("translate_query", "transform_query")
-    # workflow.add_edge("transform_query", "retrieve_documents") #TODO put back
-    workflow.add_edge("transform_query", END) # TODO remove
+    workflow.add_edge("transform_query", "retrieve_documents") #TODO put back
+    # workflow.add_edge("transform_query", END) # TODO remove
 
     workflow.add_edge("retrieve_graphs", END)
     workflow.add_edge("answer_rag", END)
```
climateqa/event_handler.py (CHANGED)

```diff
@@ -35,7 +35,7 @@ def handle_retrieved_documents(event: StreamEvent, history : list[ChatMessage],
         tuple[str, list[ChatMessage], list[str]]: The updated HTML representation of the documents, the updated message history and the updated list of used documents
     """
     if "documents" not in event["data"]["output"] or event["data"]["output"]["documents"] == []:
-            return history, used_documents
+        return history, used_documents
 
     try:
         docs = event["data"]["output"]["documents"]
```
|