📓 LangChain Quickstart¶
In this quickstart you will create a simple LCEL Chain and learn how to log it and get feedback on an LLM response.
For evaluation, we will leverage the RAG triad of groundedness, context relevance and answer relevance.
You'll also learn how to use feedbacks for guardrails, via filtering retrieved context.
# ! pip install trulens_eval openai langchain langchain-openai langchain_community faiss-cpu bs4 tiktoken
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
Import from LangChain and TruLens¶
# Imports main tools:
from trulens_eval import TruChain, Tru
tru = Tru()
tru.reset_database()
# Imports from LangChain to build app
import bs4
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
/opt/anaconda3/envs/snowday/lib/python3.11/site-packages/_distutils_hack/__init__.py:26: UserWarning: Setuptools is replacing distutils. warnings.warn("Setuptools is replacing distutils.")
🦑 Tru initialized with db url sqlite:///default.sqlite . 🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.
Load documents¶
loader = WebBaseLoader(
web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
bs_kwargs=dict(
parse_only=bs4.SoupStrainer(
class_=("post-content", "post-title", "post-header")
)
),
)
docs = loader.load()
Create Vector Store¶
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)
vectorstore = FAISS.from_documents(documents, embeddings)
Create RAG¶
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
def format_docs(docs):
return "\n\n".join(doc.page_content for doc in docs)
rag_chain = (
{"context": retriever | format_docs, "question": RunnablePassthrough()}
| prompt
| llm
| StrOutputParser()
)
/opt/anaconda3/envs/snowday/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The class `ChatOpenAI` was deprecated in LangChain 0.0.10 and will be removed in 0.3.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import ChatOpenAI`. warn_deprecated(
Send your first request¶
rag_chain.invoke("What is Task Decomposition?")
'Task Decomposition is a technique that breaks down complex tasks into smaller and simpler steps to enhance model performance. It involves transforming big tasks into manageable tasks by decomposing them into multiple steps. Task decomposition can be achieved through various methods, such as using simple prompting, task-specific instructions, or relying on external classical planners.'
Initialize Feedback Function(s)¶
from trulens_eval.feedback.provider import OpenAI
from trulens_eval import Feedback
import numpy as np
# Initialize provider class
provider = OpenAI()
# select context to be used in feedback. the location of context is app specific.
from trulens_eval.app import App
context = App.select_context(rag_chain)
# Define a groundedness feedback function
f_groundedness = (
Feedback(provider.groundedness_measure_with_cot_reasons, name = "Groundedness")
.on(context.collect()) # collect context chunks into a list
.on_output()
)
# Question/answer relevance between overall question and answer.
f_answer_relevance = (
Feedback(provider.relevance_with_cot_reasons, name = "Answer Relevance")
.on_input_output()
)
# Context relevance between question and each context chunk.
f_context_relevance = (
Feedback(provider.context_relevance_with_cot_reasons, name = "Context Relevance")
.on_input()
.on(context)
.aggregate(np.mean)
)
✅ In Groundedness, input source will be set to __record__.app.first.steps__.context.first.invoke.rets[:].page_content.collect() . ✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` . ✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` . ✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` . ✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` . ✅ In Context Relevance, input context will be set to __record__.app.first.steps__.context.first.invoke.rets[:].page_content .
Instrument chain for logging with TruLens¶
tru_recorder = TruChain(rag_chain,
app_id='Chain1_ChatApplication',
feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness])
with tru_recorder as recording:
llm_response = rag_chain.invoke("What is Task Decomposition?")
display(llm_response)
'Task Decomposition is a technique that breaks down complex tasks into smaller and simpler steps to enhance model performance. It involves transforming big tasks into manageable tasks by decomposing them into multiple steps. Task decomposition can be achieved through various methods, such as using simple prompting, task-specific instructions, or relying on external classical planners.'
Check results
tru.get_leaderboard()
Answer Relevance | Groundedness | Context Relevance | latency | total_cost | |
---|---|---|---|---|---|
app_id | |||||
Chain1_ChatApplication | 0.9 | 1.0 | 0.6 | 2.0 | 0.004985 |
By looking closer at context relevance, we see that our retriever is returning irrelevant context.
last_record = recording.records[-1]
from trulens_eval.utils.display import get_feedback_result
get_feedback_result(last_record, "Context Relevance")
question | context | ret | |
---|---|---|---|
0 | What is Task Decomposition? | Fig. 1. Overview of a LLM-powered autonomous a... | 0.8 |
1 | What is Task Decomposition? | Fig. 10. A picture of a sea otter using rock t... | 0.4 |
2 | What is Task Decomposition? | (3) Task execution: Expert models execute on t... | 0.4 |
3 | What is Task Decomposition? | Fig. 6. Illustration of how Algorithm Distilla... | 0.8 |
Use guardrails¶
In addition to making informed iteration, we can also directly use feedback results as guardrails at inference time. In particular, here we show how to use the context relevance score as a guardrail to filter out irrelevant context before it gets passed to the LLM. This both reduces hallucination and improves efficiency.
Below, you can see the TruLens feedback display of each context relevance chunk retrieved by our RAG.
Wouldn't it be great if we could automatically filter out context chunks with relevance scores below 0.5?
We can do so with the TruLens guardrail, WithFeedbackFilterDocuments. All we have to do is use the method of_retriever
to create a new filtered retriever, passing in the original retriever along with the feedback function and threshold we want to use.
from trulens_eval.guardrails.langchain import WithFeedbackFilterDocuments
# note: feedback function used for guardrail must only return a score, not also reasons
f_context_relevance_score = (
Feedback(provider.context_relevance)
)
filtered_retriever = WithFeedbackFilterDocuments.of_retriever(
retriever=retriever,
feedback=f_context_relevance_score,
threshold=0.75
)
rag_chain = (
{"context": filtered_retriever | format_docs, "question": RunnablePassthrough()}
| prompt
| llm
| StrOutputParser()
)
Then we can operate as normal
tru_recorder = TruChain(rag_chain,
app_id='Chain1_ChatApplication_Filtered',
feedbacks=[f_answer_relevance, f_context_relevance, f_groundedness])
with tru_recorder as recording:
llm_response = rag_chain.invoke("What is Task Decomposition?")
display(llm_response)
'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.'
See the power of context filters!¶
If we inspect the context relevance of our retreival now, you see only relevant context chunks!
last_record = recording.records[-1]
from trulens_eval.utils.display import get_feedback_result
get_feedback_result(last_record, "Context Relevance")
question | context | ret | |
---|---|---|---|
0 | What is Task Decomposition? | Fig. 1. Overview of a LLM-powered autonomous a... | 0.8 |
tru.run_dashboard()
Starting dashboard ... Config file already exists. Skipping writing process. Credentials file already exists. Skipping writing process.
Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…
Dashboard started at http://192.168.4.206:1236 .
<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>
Retrieve records and feedback¶
# The record of the app invocation can be retrieved from the `recording`:
rec = recording.get() # use .get if only one record
# recs = recording.records # use .records if multiple
display(rec)
Record(record_id='record_hash_b58260de958b6a44f38e92067bd75220', app_id='Chain1_ChatApplication_Filtered', cost=Cost(n_requests=6, n_successful_requests=21, n_classes=0, n_tokens=15902, n_stream_chunks=0, n_prompt_tokens=15823, n_completion_tokens=79, cost=0.023838500000000002), perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 168333), end_time=datetime.datetime(2024, 7, 3, 5, 51, 13, 718205)), ts=datetime.datetime(2024, 7, 3, 5, 51, 13, 718234), tags='-', meta=None, main_input='What is Task Decomposition?', main_output='Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', main_error=None, calls=[RecordAppCall(call_id='201a07bb-6b2f-4616-ae75-0320ec1bc97b', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.question, method=Method(obj=Obj(cls=langchain_core.runnables.passthrough.RunnablePassthrough, id=6209785744, init_bindings=None), name='invoke'))], args={'input': 'What is Task Decomposition?', 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13729026896, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets='What is Task Decomposition?', error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 320329), end_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 340951)), pid=54862, tid=6435954), RecordAppCall(call_id='213e417f-c348-4f97-aae7-7be4a78594ba', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='_get_relevant_documents')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=langchain_core.vectorstores.VectorStoreRetriever, id=13728093840, init_bindings=None), name='_get_relevant_documents'))], args={'query': 'What is Task Decomposition?', 'run_manager': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManagerForRetrieverRun', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13728753936, 'init_bindings': None}}}, rets=[{'page_content': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}, {'page_content': 'Fig. 10. A picture of a sea otter using rock to crack open a seashell, while floating in the water. While some other animals can use tools, the complexity is not comparable with humans. (Image source: Animals using tools)\nMRKL (Karpas et al. 2022), short for “Modular Reasoning, Knowledge and Language”, is a neuro-symbolic architecture for autonomous agents. A MRKL system is proposed to contain a collection of “expert” modules and the general-purpose LLM works as a router to route inquiries to the best suitable expert module. These modules can be neural (e.g. deep learning models) or symbolic (e.g. math calculator, currency converter, weather API).\nThey did an experiment on fine-tuning LLM to call a calculator, using arithmetic as a test case. Their experiments showed that it was harder to solve verbal math problems than explicitly stated math problems because LLMs (7B Jurassic1-large model) failed to extract the right arguments for the basic arithmetic reliably. The results highlight when the external symbolic tools can work reliably, knowing when to and how to use the tools are crucial, determined by the LLM capability.\nBoth TALM (Tool Augmented Language Models; Parisi et al. 2022) and Toolformer (Schick et al. 2023) fine-tune a LM to learn to use external tool APIs. The dataset is expanded based on whether a newly added API call annotation can improve the quality of model outputs. See more details in the “External APIs” section of Prompt Engineering.\nChatGPT Plugins and OpenAI API function calling are good examples of LLMs augmented with tool use capability working in practice. The collection of tool APIs can be provided by other developers (as in Plugins) or self-defined (as in function calls).\nHuggingGPT (Shen et al. 2023) is a framework to use ChatGPT as the task planner to select models available in HuggingFace platform according to the model descriptions and summarize the response based on the execution results.\n\nFig. 11. Illustration of how HuggingGPT works. (Image source: Shen et al. 2023)\nThe system comprises of 4 stages:\n(1) Task planning: LLM works as the brain and parses the user requests into multiple tasks. There are four attributes associated with each task: task type, ID, dependencies, and arguments. They use few-shot examples to guide LLM to do task parsing and planning.\nInstruction:\n\nThe AI assistant can parse user input to several tasks: [{"task": task, "id", task_id, "dep": dependency_task_ids, "args": {"text": text, "image": URL, "audio": URL, "video": URL}}]. The "dep" field denotes the id of the previous task which generates a new resource that the current task relies on. A special tag "-task_id" refers to the generated text image, audio and video in the dependency task with id as task_id. The task MUST be selected from the following options: {{ Available Task List }}. There is a logical relationship between tasks, please note their order. If the user input can\'t be parsed, you need to reply empty JSON. Here are several cases for your reference: {{ Demonstrations }}. The chat history is recorded as {{ Chat History }}. From this chat history, you can find the path of the user-mentioned resources for your task planning.\n\n(2) Model selection: LLM distributes the tasks to expert models, where the request is framed as a multiple-choice question. LLM is presented with a list of models to choose from. Due to the limited context length, task type based filtration is needed.\nInstruction:\n\nGiven the user request and the call command, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The AI assistant merely outputs the model id of the most appropriate model. The output must be in a strict JSON format: "id": "id", "reason": "your detail reason for the choice". We have a list of models for you to choose from {{ Candidate Models }}. Please select one model from the list.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}, {'page_content': "(3) Task execution: Expert models execute on the specific tasks and log results.\nInstruction:\n\nWith the input and the inference results, the AI assistant needs to describe the process and results. The previous stages can be formed as - User Input: {{ User Input }}, Task Planning: {{ Tasks }}, Model Selection: {{ Model Assignment }}, Task Execution: {{ Predictions }}. You must first answer the user's request in a straightforward manner. Then describe the task process and show your analysis and model inference results to the user in the first person. If inference results contain a file path, must tell the user the complete file path.\n\n(4) Response generation: LLM receives the execution results and provides summarized results to users.\nTo put HuggingGPT into real world usage, a couple challenges need to solve: (1) Efficiency improvement is needed as both LLM inference rounds and interactions with other models slow down the process; (2) It relies on a long context window to communicate over complicated task content; (3) Stability improvement of LLM outputs and external model services.\nAPI-Bank (Li et al. 2023) is a benchmark for evaluating the performance of tool-augmented LLMs. It contains 53 commonly used API tools, a complete tool-augmented LLM workflow, and 264 annotated dialogues that involve 568 API calls. The selection of APIs is quite diverse, including search engines, calculator, calendar queries, smart home control, schedule management, health data management, account authentication workflow and more. Because there are a large number of APIs, LLM first has access to API search engine to find the right API to call and then uses the corresponding documentation to make a call.\n\nFig. 12. Pseudo code of how LLM makes an API call in API-Bank. (Image source: Li et al. 2023)\nIn the API-Bank workflow, LLMs need to make a couple of decisions and at each step we can evaluate how accurate that decision is. Decisions include:\n\nWhether an API call is needed.\nIdentify the right API to call: if not good enough, LLMs need to iteratively modify the API inputs (e.g. deciding search keywords for Search Engine API).\nResponse based on the API results: the model can choose to refine and call again if results are not satisfied.\n\nThis benchmark evaluates the agent’s tool use capabilities at three levels:\n\nLevel-1 evaluates the ability to call the API. Given an API’s description, the model needs to determine whether to call a given API, call it correctly, and respond properly to API returns.\nLevel-2 examines the ability to retrieve the API. The model needs to search for possible APIs that may solve the user’s requirement and learn how to use them by reading documentation.\nLevel-3 assesses the ability to plan API beyond retrieve and call. Given unclear user requests (e.g. schedule group meetings, book flight/hotel/restaurant for a trip), the model may have to conduct multiple API calls to solve it.\n\nCase Studies#\nScientific Discovery Agent#\nChemCrow (Bran et al. 2023) is a domain-specific example in which LLM is augmented with 13 expert-designed tools to accomplish tasks across organic synthesis, drug discovery, and materials design. The workflow, implemented in LangChain, reflects what was previously described in the ReAct and MRKLs and combines CoT reasoning with tools relevant to the tasks:\n\nThe LLM is provided with a list of tool names, descriptions of their utility, and details about the expected input/output.\nIt is then instructed to answer a user-given prompt using the tools provided when necessary. The instruction suggests the model to follow the ReAct format - Thought, Action, Action Input, Observation.", 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}, {'page_content': 'Fig. 6. Illustration of how Algorithm Distillation (AD) works. (Image source: Laskin et al. 2023).\nThe paper hypothesizes that any algorithm that generates a set of learning histories can be distilled into a neural network by performing behavioral cloning over actions. The history data is generated by a set of source policies, each trained for a specific task. At the training stage, during each RL run, a random task is sampled and a subsequence of multi-episode history is used for training, such that the learned policy is task-agnostic.\nIn reality, the model has limited context window length, so episodes should be short enough to construct multi-episode history. Multi-episodic contexts of 2-4 episodes are necessary to learn a near-optimal in-context RL algorithm. The emergence of in-context RL requires long enough context.\nIn comparison with three baselines, including ED (expert distillation, behavior cloning with expert trajectories instead of learning history), source policy (used for generating trajectories for distillation by UCB), RL^2 (Duan et al. 2017; used as upper bound since it needs online RL), AD demonstrates in-context RL with performance getting close to RL^2 despite only using offline RL and learns much faster than other baselines. When conditioned on partial training history of the source policy, AD also improves much faster than ED baseline.\n\nFig. 7. Comparison of AD, ED, source policy and RL^2 on environments that require memory and exploration. Only binary reward is assigned. The source policies are trained with A3C for "dark" environments and DQN for watermaze.(Image source: Laskin et al. 2023)\nComponent Two: Memory#\n(Big thank you to ChatGPT for helping me draft this section. I’ve learned a lot about the human brain and data structure for fast MIPS in my conversations with ChatGPT.)\nTypes of Memory#\nMemory can be defined as the processes used to acquire, store, retain, and later retrieve information. There are several types of memory in human brains.\n\n\nSensory Memory: This is the earliest stage of memory, providing the ability to retain impressions of sensory information (visual, auditory, etc) after the original stimuli have ended. Sensory memory typically only lasts for up to a few seconds. Subcategories include iconic memory (visual), echoic memory (auditory), and haptic memory (touch).\n\n\nShort-Term Memory (STM) or Working Memory: It stores information that we are currently aware of and needed to carry out complex cognitive tasks such as learning and reasoning. Short-term memory is believed to have the capacity of about 7 items (Miller 1956) and lasts for 20-30 seconds.\n\n\nLong-Term Memory (LTM): Long-term memory can store information for a remarkably long time, ranging from a few days to decades, with an essentially unlimited storage capacity. There are two subtypes of LTM:\n\nExplicit / declarative memory: This is memory of facts and events, and refers to those memories that can be consciously recalled, including episodic memory (events and experiences) and semantic memory (facts and concepts).\nImplicit / procedural memory: This type of memory is unconscious and involves skills and routines that are performed automatically, like riding a bike or typing on a keyboard.\n\n\n\n\nFig. 8. Categorization of human memory.\nWe can roughly consider the following mappings:\n\nSensory memory as learning embedding representations for raw inputs, including text, image or other modalities;\nShort-term memory as in-context learning. It is short and finite, as it is restricted by the finite context window length of Transformer.\nLong-term memory as the external vector store that the agent can attend to at query time, accessible via fast retrieval.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}], error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 359885), end_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 572243)), pid=54862, tid=6435953), RecordAppCall(call_id='15a79df3-d358-42b9-88c9-53ea69d4a05d', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='_get_relevant_documents')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first.feedback, method=Method(obj=Obj(cls=trulens_eval.feedback.feedback.Feedback, id=15142675120, init_bindings=None), name='__call__'))], args={'args': ['What is Task Decomposition?', 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.']}, rets=0.8, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 579728), end_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 910319)), pid=54862, tid=6435956), RecordAppCall(call_id='97f870cf-22d6-4b09-8c35-3f8eeeb89272', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='_get_relevant_documents')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first.feedback, method=Method(obj=Obj(cls=trulens_eval.feedback.feedback.Feedback, id=15142675120, init_bindings=None), name='__call__'))], args={'args': ['What is Task Decomposition?', 'Fig. 10. A picture of a sea otter using rock to crack open a seashell, while floating in the water. While some other animals can use tools, the complexity is not comparable with humans. (Image source: Animals using tools)\nMRKL (Karpas et al. 2022), short for “Modular Reasoning, Knowledge and Language”, is a neuro-symbolic architecture for autonomous agents. A MRKL system is proposed to contain a collection of “expert” modules and the general-purpose LLM works as a router to route inquiries to the best suitable expert module. These modules can be neural (e.g. deep learning models) or symbolic (e.g. math calculator, currency converter, weather API).\nThey did an experiment on fine-tuning LLM to call a calculator, using arithmetic as a test case. Their experiments showed that it was harder to solve verbal math problems than explicitly stated math problems because LLMs (7B Jurassic1-large model) failed to extract the right arguments for the basic arithmetic reliably. The results highlight when the external symbolic tools can work reliably, knowing when to and how to use the tools are crucial, determined by the LLM capability.\nBoth TALM (Tool Augmented Language Models; Parisi et al. 2022) and Toolformer (Schick et al. 2023) fine-tune a LM to learn to use external tool APIs. The dataset is expanded based on whether a newly added API call annotation can improve the quality of model outputs. See more details in the “External APIs” section of Prompt Engineering.\nChatGPT Plugins and OpenAI API function calling are good examples of LLMs augmented with tool use capability working in practice. The collection of tool APIs can be provided by other developers (as in Plugins) or self-defined (as in function calls).\nHuggingGPT (Shen et al. 2023) is a framework to use ChatGPT as the task planner to select models available in HuggingFace platform according to the model descriptions and summarize the response based on the execution results.\n\nFig. 11. Illustration of how HuggingGPT works. (Image source: Shen et al. 2023)\nThe system comprises of 4 stages:\n(1) Task planning: LLM works as the brain and parses the user requests into multiple tasks. There are four attributes associated with each task: task type, ID, dependencies, and arguments. They use few-shot examples to guide LLM to do task parsing and planning.\nInstruction:\n\nThe AI assistant can parse user input to several tasks: [{"task": task, "id", task_id, "dep": dependency_task_ids, "args": {"text": text, "image": URL, "audio": URL, "video": URL}}]. The "dep" field denotes the id of the previous task which generates a new resource that the current task relies on. A special tag "-task_id" refers to the generated text image, audio and video in the dependency task with id as task_id. The task MUST be selected from the following options: {{ Available Task List }}. There is a logical relationship between tasks, please note their order. If the user input can\'t be parsed, you need to reply empty JSON. Here are several cases for your reference: {{ Demonstrations }}. The chat history is recorded as {{ Chat History }}. From this chat history, you can find the path of the user-mentioned resources for your task planning.\n\n(2) Model selection: LLM distributes the tasks to expert models, where the request is framed as a multiple-choice question. LLM is presented with a list of models to choose from. Due to the limited context length, task type based filtration is needed.\nInstruction:\n\nGiven the user request and the call command, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The AI assistant merely outputs the model id of the most appropriate model. The output must be in a strict JSON format: "id": "id", "reason": "your detail reason for the choice". We have a list of models for you to choose from {{ Candidate Models }}. Please select one model from the list.']}, rets=0.4, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 583334), end_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 926111)), pid=54862, tid=6435957), RecordAppCall(call_id='ba9d8945-aedf-450d-b117-11e076671be5', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='_get_relevant_documents')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first.feedback, method=Method(obj=Obj(cls=trulens_eval.feedback.feedback.Feedback, id=15142675120, init_bindings=None), name='__call__'))], args={'args': ['What is Task Decomposition?', "(3) Task execution: Expert models execute on the specific tasks and log results.\nInstruction:\n\nWith the input and the inference results, the AI assistant needs to describe the process and results. The previous stages can be formed as - User Input: {{ User Input }}, Task Planning: {{ Tasks }}, Model Selection: {{ Model Assignment }}, Task Execution: {{ Predictions }}. You must first answer the user's request in a straightforward manner. Then describe the task process and show your analysis and model inference results to the user in the first person. If inference results contain a file path, must tell the user the complete file path.\n\n(4) Response generation: LLM receives the execution results and provides summarized results to users.\nTo put HuggingGPT into real world usage, a couple challenges need to solve: (1) Efficiency improvement is needed as both LLM inference rounds and interactions with other models slow down the process; (2) It relies on a long context window to communicate over complicated task content; (3) Stability improvement of LLM outputs and external model services.\nAPI-Bank (Li et al. 2023) is a benchmark for evaluating the performance of tool-augmented LLMs. It contains 53 commonly used API tools, a complete tool-augmented LLM workflow, and 264 annotated dialogues that involve 568 API calls. The selection of APIs is quite diverse, including search engines, calculator, calendar queries, smart home control, schedule management, health data management, account authentication workflow and more. Because there are a large number of APIs, LLM first has access to API search engine to find the right API to call and then uses the corresponding documentation to make a call.\n\nFig. 12. Pseudo code of how LLM makes an API call in API-Bank. (Image source: Li et al. 2023)\nIn the API-Bank workflow, LLMs need to make a couple of decisions and at each step we can evaluate how accurate that decision is. Decisions include:\n\nWhether an API call is needed.\nIdentify the right API to call: if not good enough, LLMs need to iteratively modify the API inputs (e.g. deciding search keywords for Search Engine API).\nResponse based on the API results: the model can choose to refine and call again if results are not satisfied.\n\nThis benchmark evaluates the agent’s tool use capabilities at three levels:\n\nLevel-1 evaluates the ability to call the API. Given an API’s description, the model needs to determine whether to call a given API, call it correctly, and respond properly to API returns.\nLevel-2 examines the ability to retrieve the API. The model needs to search for possible APIs that may solve the user’s requirement and learn how to use them by reading documentation.\nLevel-3 assesses the ability to plan API beyond retrieve and call. Given unclear user requests (e.g. schedule group meetings, book flight/hotel/restaurant for a trip), the model may have to conduct multiple API calls to solve it.\n\nCase Studies#\nScientific Discovery Agent#\nChemCrow (Bran et al. 2023) is a domain-specific example in which LLM is augmented with 13 expert-designed tools to accomplish tasks across organic synthesis, drug discovery, and materials design. The workflow, implemented in LangChain, reflects what was previously described in the ReAct and MRKLs and combines CoT reasoning with tools relevant to the tasks:\n\nThe LLM is provided with a list of tool names, descriptions of their utility, and details about the expected input/output.\nIt is then instructed to answer a user-given prompt using the tools provided when necessary. The instruction suggests the model to follow the ReAct format - Thought, Action, Action Input, Observation."]}, rets=0.7, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 587395), end_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 962103)), pid=54862, tid=6435958), RecordAppCall(call_id='1d661575-b055-4a76-a521-93d75e128e7b', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='_get_relevant_documents')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first.feedback, method=Method(obj=Obj(cls=trulens_eval.feedback.feedback.Feedback, id=15142675120, init_bindings=None), name='__call__'))], args={'args': ['What is Task Decomposition?', 'Fig. 6. Illustration of how Algorithm Distillation (AD) works. (Image source: Laskin et al. 2023).\nThe paper hypothesizes that any algorithm that generates a set of learning histories can be distilled into a neural network by performing behavioral cloning over actions. The history data is generated by a set of source policies, each trained for a specific task. At the training stage, during each RL run, a random task is sampled and a subsequence of multi-episode history is used for training, such that the learned policy is task-agnostic.\nIn reality, the model has limited context window length, so episodes should be short enough to construct multi-episode history. Multi-episodic contexts of 2-4 episodes are necessary to learn a near-optimal in-context RL algorithm. The emergence of in-context RL requires long enough context.\nIn comparison with three baselines, including ED (expert distillation, behavior cloning with expert trajectories instead of learning history), source policy (used for generating trajectories for distillation by UCB), RL^2 (Duan et al. 2017; used as upper bound since it needs online RL), AD demonstrates in-context RL with performance getting close to RL^2 despite only using offline RL and learns much faster than other baselines. When conditioned on partial training history of the source policy, AD also improves much faster than ED baseline.\n\nFig. 7. Comparison of AD, ED, source policy and RL^2 on environments that require memory and exploration. Only binary reward is assigned. The source policies are trained with A3C for "dark" environments and DQN for watermaze.(Image source: Laskin et al. 2023)\nComponent Two: Memory#\n(Big thank you to ChatGPT for helping me draft this section. I’ve learned a lot about the human brain and data structure for fast MIPS in my conversations with ChatGPT.)\nTypes of Memory#\nMemory can be defined as the processes used to acquire, store, retain, and later retrieve information. There are several types of memory in human brains.\n\n\nSensory Memory: This is the earliest stage of memory, providing the ability to retain impressions of sensory information (visual, auditory, etc) after the original stimuli have ended. Sensory memory typically only lasts for up to a few seconds. Subcategories include iconic memory (visual), echoic memory (auditory), and haptic memory (touch).\n\n\nShort-Term Memory (STM) or Working Memory: It stores information that we are currently aware of and needed to carry out complex cognitive tasks such as learning and reasoning. Short-term memory is believed to have the capacity of about 7 items (Miller 1956) and lasts for 20-30 seconds.\n\n\nLong-Term Memory (LTM): Long-term memory can store information for a remarkably long time, ranging from a few days to decades, with an essentially unlimited storage capacity. There are two subtypes of LTM:\n\nExplicit / declarative memory: This is memory of facts and events, and refers to those memories that can be consciously recalled, including episodic memory (events and experiences) and semantic memory (facts and concepts).\nImplicit / procedural memory: This type of memory is unconscious and involves skills and routines that are performed automatically, like riding a bike or typing on a keyboard.\n\n\n\n\nFig. 8. Categorization of human memory.\nWe can roughly consider the following mappings:\n\nSensory memory as learning embedding representations for raw inputs, including text, image or other modalities;\nShort-term memory as in-context learning. It is short and finite, as it is restricted by the finite context window length of Transformer.\nLong-term memory as the external vector store that the agent can attend to at query time, accessible via fast retrieval.']}, rets=0.7, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 588874), end_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 96782)), pid=54862, tid=6435959), RecordAppCall(call_id='116aef8b-819d-4c8f-97ef-7c7cf9bbf679', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='_get_relevant_documents'))], args={'query': 'What is Task Decomposition?', 'run_manager': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManagerForRetrieverRun', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13728753936, 'init_bindings': None}}}, rets=[{'page_content': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}], error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 344589), end_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 97110)), pid=54862, tid=6435953), RecordAppCall(call_id='d142197e-7eb5-4f9a-903b-709ee271b41a', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke'))], args={'input': 'What is Task Decomposition?', 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 14945277584, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets=[{'page_content': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}], error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 323578), end_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 97315)), pid=54862, tid=6435953), RecordAppCall(call_id='a41025a7-e33a-41a3-8fb2-c5750cad6820', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke'))], args={'input': 'What is Task Decomposition?', 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 15143364304, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets='Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 298906), end_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 97865)), pid=54862, tid=6435953), RecordAppCall(call_id='fa15611c-5c96-42ed-a9bb-ab7128b4598d', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke'))], args={'input': 'What is Task Decomposition?', 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13728000272, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets={'context': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'question': 'What is Task Decomposition?'}, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 241491), end_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 98091)), pid=54862, tid=6435330), RecordAppCall(call_id='b917c04f-9f13-4771-8c83-ea5bcd954f73', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.middle[0], method=Method(obj=Obj(cls=langchain_core.prompts.chat.ChatPromptTemplate, id=15129453584, init_bindings=None), name='invoke'))], args={'input': {'context': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'question': 'What is Task Decomposition?'}, 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13726364240, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets={'messages': [{'content': 'You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don\'t know the answer, just say that you don\'t know. Use three sentences maximum and keep the answer concise.\nQuestion: What is Task Decomposition? \nContext: Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results. \nAnswer:', 'additional_kwargs': {}, 'response_metadata': {}, 'type': 'human', 'name': None, 'id': None, 'example': False}]}, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 178202), end_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 215342)), pid=54862, tid=6435330), RecordAppCall(call_id='72a5680b-5c89-47ba-8e2d-2c00bf28bb46', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.middle[1], method=Method(obj=Obj(cls=langchain_community.chat_models.openai.ChatOpenAI, id=15122144848, init_bindings=None), name='invoke'))], args={'input': {'messages': [{'content': 'You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don\'t know the answer, just say that you don\'t know. Use three sentences maximum and keep the answer concise.\nQuestion: What is Task Decomposition? \nContext: Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results. \nAnswer:', 'additional_kwargs': {}, 'response_metadata': {}, 'type': 'human', 'name': None, 'id': None, 'example': False}]}, 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 15145143184, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets={'content': 'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', 'additional_kwargs': {}, 'response_metadata': {'token_usage': {'completion_tokens': 65, 'prompt_tokens': 836, 'total_tokens': 901}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, 'type': 'ai', 'name': None, 'id': 'run-05f3d6ee-fc3a-4a3f-aaf1-812eb7375cb9-0', 'example': False, 'tool_calls': [], 'invalid_tool_calls': [], 'usage_metadata': None}, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 310321), end_time=datetime.datetime(2024, 7, 3, 5, 51, 13, 631697)), pid=54862, tid=6435330), RecordAppCall(call_id='7b1114dd-b7f8-4de4-993f-06dda4194d33', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.last, method=Method(obj=Obj(cls=langchain_core.output_parsers.string.StrOutputParser, id=13724810128, init_bindings=None), name='invoke'))], args={'input': {'content': 'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', 'additional_kwargs': {}, 'response_metadata': {'token_usage': {'completion_tokens': 65, 'prompt_tokens': 836, 'total_tokens': 901}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, 'type': 'ai', 'name': None, 'id': 'run-05f3d6ee-fc3a-4a3f-aaf1-812eb7375cb9-0', 'example': False, 'tool_calls': [], 'invalid_tool_calls': [], 'usage_metadata': None}, 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13727989648, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets='Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 13, 690211), end_time=datetime.datetime(2024, 7, 3, 5, 51, 13, 718171)), pid=54862, tid=6435330), RecordAppCall(call_id='c74b4da5-6555-4647-ae00-115a614e701d', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke'))], args={'input': 'What is Task Decomposition?'}, rets='Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 168333), end_time=datetime.datetime(2024, 7, 3, 5, 51, 13, 718205)), pid=54862, tid=6435330)], feedback_and_future_results=[(FeedbackDefinition(Answer Relevance, selectors={'prompt': Lens().__record__.main_input, 'response': Lens().__record__.main_output}, if_exists=None ), <Future at 0x3854f7990 state=finished returned FeedbackResult>), (FeedbackDefinition(Context Relevance, selectors={'question': Lens().__record__.main_input, 'context': Lens().__record__.app.first.steps__.context.first.invoke.rets[:].page_content}, if_exists=None ), <Future at 0x386ba6b90 state=finished returned FeedbackResult>), (FeedbackDefinition(Groundedness, selectors={'source': Lens().__record__.app.first.steps__.context.first.invoke.rets[:].page_content.collect(), 'statement': Lens().__record__.main_output}, if_exists=None ), <Future at 0x386b55010 state=finished returned FeedbackResult>)], feedback_results=[<Future at 0x3854f7990 state=finished returned FeedbackResult>, <Future at 0x386ba6b90 state=finished returned FeedbackResult>, <Future at 0x386b55010 state=finished returned FeedbackResult>])
# The results of the feedback functions can be rertrieved from
# `Record.feedback_results` or using the `wait_for_feedback_result` method. The
# results if retrieved directly are `Future` instances (see
# `concurrent.futures`). You can use `as_completed` to wait until they have
# finished evaluating or use the utility method:
for feedback, feedback_result in rec.wait_for_feedback_results().items():
print(feedback.name, feedback_result.result)
# See more about wait_for_feedback_results:
# help(rec.wait_for_feedback_results)
Answer Relevance 0.9 Context Relevance 0.8 Groundedness 0.9666666666666667
records, feedback = tru.get_records_and_feedback(app_ids=[])
records.head()
app_id | app_json | type | record_id | input | output | tags | record_json | cost_json | perf_json | ts | Groundedness | Context Relevance | Answer Relevance | Groundedness_calls | Context Relevance_calls | Answer Relevance_calls | latency | total_tokens | total_cost | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Chain1_ChatApplication | {"tru_class_info": {"name": "TruChain", "modul... | RunnableSequence(langchain_core.runnables.base) | record_hash_402adfcfca139d116e6a0dd79b7813c8 | "What is Task Decomposition?" | "Task Decomposition is a technique that breaks... | - | {"record_id": "record_hash_402adfcfca139d116e6... | {"n_requests": 2, "n_successful_requests": 3, ... | {"start_time": "2024-07-03T05:50:39.251674", "... | 2024-07-03T05:50:41.272357 | 1.000000 | 0.6 | 0.9 | [{'args': {'source': ['Fig. 1. Overview of a L... | [{'args': {'question': 'What is Task Decomposi... | [{'args': {'prompt': 'What is Task Decompositi... | 2 | 3314 | 0.004985 |
1 | Chain1_ChatApplication_Filtered | {"tru_class_info": {"name": "TruChain", "modul... | RunnableSequence(langchain_core.runnables.base) | record_hash_b58260de958b6a44f38e92067bd75220 | "What is Task Decomposition?" | "Task decomposition is a technique used to bre... | - | {"record_id": "record_hash_b58260de958b6a44f38... | {"n_requests": 6, "n_successful_requests": 21,... | {"start_time": "2024-07-03T05:51:11.168333", "... | 2024-07-03T05:51:13.718234 | 0.966667 | 0.8 | 0.9 | [{'args': {'source': ['Fig. 1. Overview of a L... | [{'args': {'question': 'What is Task Decomposi... | [{'args': {'prompt': 'What is Task Decompositi... | 2 | 15902 | 0.023839 |
tru.get_leaderboard(app_ids=[])
Answer Relevance | Groundedness | Context Relevance | latency | total_cost | |
---|---|---|---|---|---|
app_id | |||||
Chain1_ChatApplication | 0.9 | 1.000000 | 0.6 | 2.0 | 0.004985 |
Chain1_ChatApplication_Filtered | 0.9 | 0.966667 | 0.8 | 2.0 | 0.023839 |
Explore in a Dashboard¶
tru.run_dashboard() # open a local streamlit app to explore
# tru.stop_dashboard() # stop if needed
Alternatively, you can run trulens-eval
from a command line in the same folder to start the dashboard.
Learn more about the call stack¶
json_like = last_record.layout_calls_as_app()
json_like
Munch({'record_id': 'record_hash_b58260de958b6a44f38e92067bd75220', 'app_id': 'Chain1_ChatApplication_Filtered', 'cost': {'n_requests': 6, 'n_successful_requests': 21, 'n_classes': 0, 'n_tokens': 15902, 'n_stream_chunks': 0, 'n_prompt_tokens': 15823, 'n_completion_tokens': 79, 'cost': 0.023838500000000002}, 'perf': {'start_time': '2024-07-03T05:51:11.168333', 'end_time': '2024-07-03T05:51:13.718205'}, 'ts': '2024-07-03T05:51:13.718234', 'tags': '-', 'meta': None, 'main_input': 'What is Task Decomposition?', 'main_output': 'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', 'main_error': None, 'calls': [{'call_id': '201a07bb-6b2f-4616-ae75-0320ec1bc97b', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first', 'method': {'obj': {'cls': {'name': 'RunnableParallel', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 13726547408, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.question', 'method': {'obj': {'cls': {'name': 'RunnablePassthrough', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.passthrough'}, 'bases': None}, 'id': 6209785744, 'init_bindings': None}, 'name': 'invoke'}}], 'args': {'input': 'What is Task Decomposition?', 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13729026896, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, 'rets': 'What is Task Decomposition?', 'error': None, 'perf': {'start_time': '2024-07-03T05:51:11.320329', 'end_time': '2024-07-03T05:51:11.340951'}, 'pid': 54862, 'tid': 6435954}, {'call_id': '213e417f-c348-4f97-aae7-7be4a78594ba', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first', 'method': {'obj': {'cls': {'name': 'RunnableParallel', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 13726547408, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': '_get_relevant_documents'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'VectorStoreRetriever', 'module': {'package_name': 'langchain_core', 'module_name': 'langchain_core.vectorstores'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': '_get_relevant_documents'}}], 'args': {'query': 'What is Task Decomposition?', 'run_manager': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManagerForRetrieverRun', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13728753936, 'init_bindings': None}}}, 'rets': [{'page_content': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}, {'page_content': 'Fig. 10. A picture of a sea otter using rock to crack open a seashell, while floating in the water. While some other animals can use tools, the complexity is not comparable with humans. (Image source: Animals using tools)\nMRKL (Karpas et al. 2022), short for “Modular Reasoning, Knowledge and Language”, is a neuro-symbolic architecture for autonomous agents. A MRKL system is proposed to contain a collection of “expert” modules and the general-purpose LLM works as a router to route inquiries to the best suitable expert module. These modules can be neural (e.g. deep learning models) or symbolic (e.g. math calculator, currency converter, weather API).\nThey did an experiment on fine-tuning LLM to call a calculator, using arithmetic as a test case. Their experiments showed that it was harder to solve verbal math problems than explicitly stated math problems because LLMs (7B Jurassic1-large model) failed to extract the right arguments for the basic arithmetic reliably. The results highlight when the external symbolic tools can work reliably, knowing when to and how to use the tools are crucial, determined by the LLM capability.\nBoth TALM (Tool Augmented Language Models; Parisi et al. 2022) and Toolformer (Schick et al. 2023) fine-tune a LM to learn to use external tool APIs. The dataset is expanded based on whether a newly added API call annotation can improve the quality of model outputs. See more details in the “External APIs” section of Prompt Engineering.\nChatGPT Plugins and OpenAI API function calling are good examples of LLMs augmented with tool use capability working in practice. The collection of tool APIs can be provided by other developers (as in Plugins) or self-defined (as in function calls).\nHuggingGPT (Shen et al. 2023) is a framework to use ChatGPT as the task planner to select models available in HuggingFace platform according to the model descriptions and summarize the response based on the execution results.\n\nFig. 11. Illustration of how HuggingGPT works. (Image source: Shen et al. 2023)\nThe system comprises of 4 stages:\n(1) Task planning: LLM works as the brain and parses the user requests into multiple tasks. There are four attributes associated with each task: task type, ID, dependencies, and arguments. They use few-shot examples to guide LLM to do task parsing and planning.\nInstruction:\n\nThe AI assistant can parse user input to several tasks: [{"task": task, "id", task_id, "dep": dependency_task_ids, "args": {"text": text, "image": URL, "audio": URL, "video": URL}}]. The "dep" field denotes the id of the previous task which generates a new resource that the current task relies on. A special tag "-task_id" refers to the generated text image, audio and video in the dependency task with id as task_id. The task MUST be selected from the following options: {{ Available Task List }}. There is a logical relationship between tasks, please note their order. If the user input can\'t be parsed, you need to reply empty JSON. Here are several cases for your reference: {{ Demonstrations }}. The chat history is recorded as {{ Chat History }}. From this chat history, you can find the path of the user-mentioned resources for your task planning.\n\n(2) Model selection: LLM distributes the tasks to expert models, where the request is framed as a multiple-choice question. LLM is presented with a list of models to choose from. Due to the limited context length, task type based filtration is needed.\nInstruction:\n\nGiven the user request and the call command, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The AI assistant merely outputs the model id of the most appropriate model. The output must be in a strict JSON format: "id": "id", "reason": "your detail reason for the choice". We have a list of models for you to choose from {{ Candidate Models }}. Please select one model from the list.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}, {'page_content': "(3) Task execution: Expert models execute on the specific tasks and log results.\nInstruction:\n\nWith the input and the inference results, the AI assistant needs to describe the process and results. The previous stages can be formed as - User Input: {{ User Input }}, Task Planning: {{ Tasks }}, Model Selection: {{ Model Assignment }}, Task Execution: {{ Predictions }}. You must first answer the user's request in a straightforward manner. Then describe the task process and show your analysis and model inference results to the user in the first person. If inference results contain a file path, must tell the user the complete file path.\n\n(4) Response generation: LLM receives the execution results and provides summarized results to users.\nTo put HuggingGPT into real world usage, a couple challenges need to solve: (1) Efficiency improvement is needed as both LLM inference rounds and interactions with other models slow down the process; (2) It relies on a long context window to communicate over complicated task content; (3) Stability improvement of LLM outputs and external model services.\nAPI-Bank (Li et al. 2023) is a benchmark for evaluating the performance of tool-augmented LLMs. It contains 53 commonly used API tools, a complete tool-augmented LLM workflow, and 264 annotated dialogues that involve 568 API calls. The selection of APIs is quite diverse, including search engines, calculator, calendar queries, smart home control, schedule management, health data management, account authentication workflow and more. Because there are a large number of APIs, LLM first has access to API search engine to find the right API to call and then uses the corresponding documentation to make a call.\n\nFig. 12. Pseudo code of how LLM makes an API call in API-Bank. (Image source: Li et al. 2023)\nIn the API-Bank workflow, LLMs need to make a couple of decisions and at each step we can evaluate how accurate that decision is. Decisions include:\n\nWhether an API call is needed.\nIdentify the right API to call: if not good enough, LLMs need to iteratively modify the API inputs (e.g. deciding search keywords for Search Engine API).\nResponse based on the API results: the model can choose to refine and call again if results are not satisfied.\n\nThis benchmark evaluates the agent’s tool use capabilities at three levels:\n\nLevel-1 evaluates the ability to call the API. Given an API’s description, the model needs to determine whether to call a given API, call it correctly, and respond properly to API returns.\nLevel-2 examines the ability to retrieve the API. The model needs to search for possible APIs that may solve the user’s requirement and learn how to use them by reading documentation.\nLevel-3 assesses the ability to plan API beyond retrieve and call. Given unclear user requests (e.g. schedule group meetings, book flight/hotel/restaurant for a trip), the model may have to conduct multiple API calls to solve it.\n\nCase Studies#\nScientific Discovery Agent#\nChemCrow (Bran et al. 2023) is a domain-specific example in which LLM is augmented with 13 expert-designed tools to accomplish tasks across organic synthesis, drug discovery, and materials design. The workflow, implemented in LangChain, reflects what was previously described in the ReAct and MRKLs and combines CoT reasoning with tools relevant to the tasks:\n\nThe LLM is provided with a list of tool names, descriptions of their utility, and details about the expected input/output.\nIt is then instructed to answer a user-given prompt using the tools provided when necessary. The instruction suggests the model to follow the ReAct format - Thought, Action, Action Input, Observation.", 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}, {'page_content': 'Fig. 6. Illustration of how Algorithm Distillation (AD) works. (Image source: Laskin et al. 2023).\nThe paper hypothesizes that any algorithm that generates a set of learning histories can be distilled into a neural network by performing behavioral cloning over actions. The history data is generated by a set of source policies, each trained for a specific task. At the training stage, during each RL run, a random task is sampled and a subsequence of multi-episode history is used for training, such that the learned policy is task-agnostic.\nIn reality, the model has limited context window length, so episodes should be short enough to construct multi-episode history. Multi-episodic contexts of 2-4 episodes are necessary to learn a near-optimal in-context RL algorithm. The emergence of in-context RL requires long enough context.\nIn comparison with three baselines, including ED (expert distillation, behavior cloning with expert trajectories instead of learning history), source policy (used for generating trajectories for distillation by UCB), RL^2 (Duan et al. 2017; used as upper bound since it needs online RL), AD demonstrates in-context RL with performance getting close to RL^2 despite only using offline RL and learns much faster than other baselines. When conditioned on partial training history of the source policy, AD also improves much faster than ED baseline.\n\nFig. 7. Comparison of AD, ED, source policy and RL^2 on environments that require memory and exploration. Only binary reward is assigned. The source policies are trained with A3C for "dark" environments and DQN for watermaze.(Image source: Laskin et al. 2023)\nComponent Two: Memory#\n(Big thank you to ChatGPT for helping me draft this section. I’ve learned a lot about the human brain and data structure for fast MIPS in my conversations with ChatGPT.)\nTypes of Memory#\nMemory can be defined as the processes used to acquire, store, retain, and later retrieve information. There are several types of memory in human brains.\n\n\nSensory Memory: This is the earliest stage of memory, providing the ability to retain impressions of sensory information (visual, auditory, etc) after the original stimuli have ended. Sensory memory typically only lasts for up to a few seconds. Subcategories include iconic memory (visual), echoic memory (auditory), and haptic memory (touch).\n\n\nShort-Term Memory (STM) or Working Memory: It stores information that we are currently aware of and needed to carry out complex cognitive tasks such as learning and reasoning. Short-term memory is believed to have the capacity of about 7 items (Miller 1956) and lasts for 20-30 seconds.\n\n\nLong-Term Memory (LTM): Long-term memory can store information for a remarkably long time, ranging from a few days to decades, with an essentially unlimited storage capacity. There are two subtypes of LTM:\n\nExplicit / declarative memory: This is memory of facts and events, and refers to those memories that can be consciously recalled, including episodic memory (events and experiences) and semantic memory (facts and concepts).\nImplicit / procedural memory: This type of memory is unconscious and involves skills and routines that are performed automatically, like riding a bike or typing on a keyboard.\n\n\n\n\nFig. 8. Categorization of human memory.\nWe can roughly consider the following mappings:\n\nSensory memory as learning embedding representations for raw inputs, including text, image or other modalities;\nShort-term memory as in-context learning. It is short and finite, as it is restricted by the finite context window length of Transformer.\nLong-term memory as the external vector store that the agent can attend to at query time, accessible via fast retrieval.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}], 'error': None, 'perf': {'start_time': '2024-07-03T05:51:11.359885', 'end_time': '2024-07-03T05:51:11.572243'}, 'pid': 54862, 'tid': 6435953}, {'call_id': '15a79df3-d358-42b9-88c9-53ea69d4a05d', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first', 'method': {'obj': {'cls': {'name': 'RunnableParallel', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 13726547408, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': '_get_relevant_documents'}}, {'path': 'app.first.steps__.context.first.feedback', 'method': {'obj': {'cls': {'name': 'Feedback', 'module': {'package_name': 'trulens_eval.feedback', 'module_name': 'trulens_eval.feedback.feedback'}, 'bases': None}, 'id': 15142675120, 'init_bindings': None}, 'name': '__call__'}}], 'args': {'args': ['What is Task Decomposition?', 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.']}, 'rets': 0.8, 'error': None, 'perf': {'start_time': '2024-07-03T05:51:11.579728', 'end_time': '2024-07-03T05:51:11.910319'}, 'pid': 54862, 'tid': 6435956}, {'call_id': '97f870cf-22d6-4b09-8c35-3f8eeeb89272', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first', 'method': {'obj': {'cls': {'name': 'RunnableParallel', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 13726547408, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': '_get_relevant_documents'}}, {'path': 'app.first.steps__.context.first.feedback', 'method': {'obj': {'cls': {'name': 'Feedback', 'module': {'package_name': 'trulens_eval.feedback', 'module_name': 'trulens_eval.feedback.feedback'}, 'bases': None}, 'id': 15142675120, 'init_bindings': None}, 'name': '__call__'}}], 'args': {'args': ['What is Task Decomposition?', 'Fig. 10. A picture of a sea otter using rock to crack open a seashell, while floating in the water. While some other animals can use tools, the complexity is not comparable with humans. (Image source: Animals using tools)\nMRKL (Karpas et al. 2022), short for “Modular Reasoning, Knowledge and Language”, is a neuro-symbolic architecture for autonomous agents. A MRKL system is proposed to contain a collection of “expert” modules and the general-purpose LLM works as a router to route inquiries to the best suitable expert module. These modules can be neural (e.g. deep learning models) or symbolic (e.g. math calculator, currency converter, weather API).\nThey did an experiment on fine-tuning LLM to call a calculator, using arithmetic as a test case. Their experiments showed that it was harder to solve verbal math problems than explicitly stated math problems because LLMs (7B Jurassic1-large model) failed to extract the right arguments for the basic arithmetic reliably. The results highlight when the external symbolic tools can work reliably, knowing when to and how to use the tools are crucial, determined by the LLM capability.\nBoth TALM (Tool Augmented Language Models; Parisi et al. 2022) and Toolformer (Schick et al. 2023) fine-tune a LM to learn to use external tool APIs. The dataset is expanded based on whether a newly added API call annotation can improve the quality of model outputs. See more details in the “External APIs” section of Prompt Engineering.\nChatGPT Plugins and OpenAI API function calling are good examples of LLMs augmented with tool use capability working in practice. The collection of tool APIs can be provided by other developers (as in Plugins) or self-defined (as in function calls).\nHuggingGPT (Shen et al. 2023) is a framework to use ChatGPT as the task planner to select models available in HuggingFace platform according to the model descriptions and summarize the response based on the execution results.\n\nFig. 11. Illustration of how HuggingGPT works. (Image source: Shen et al. 2023)\nThe system comprises of 4 stages:\n(1) Task planning: LLM works as the brain and parses the user requests into multiple tasks. There are four attributes associated with each task: task type, ID, dependencies, and arguments. They use few-shot examples to guide LLM to do task parsing and planning.\nInstruction:\n\nThe AI assistant can parse user input to several tasks: [{"task": task, "id", task_id, "dep": dependency_task_ids, "args": {"text": text, "image": URL, "audio": URL, "video": URL}}]. The "dep" field denotes the id of the previous task which generates a new resource that the current task relies on. A special tag "-task_id" refers to the generated text image, audio and video in the dependency task with id as task_id. The task MUST be selected from the following options: {{ Available Task List }}. There is a logical relationship between tasks, please note their order. If the user input can\'t be parsed, you need to reply empty JSON. Here are several cases for your reference: {{ Demonstrations }}. The chat history is recorded as {{ Chat History }}. From this chat history, you can find the path of the user-mentioned resources for your task planning.\n\n(2) Model selection: LLM distributes the tasks to expert models, where the request is framed as a multiple-choice question. LLM is presented with a list of models to choose from. Due to the limited context length, task type based filtration is needed.\nInstruction:\n\nGiven the user request and the call command, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The AI assistant merely outputs the model id of the most appropriate model. The output must be in a strict JSON format: "id": "id", "reason": "your detail reason for the choice". We have a list of models for you to choose from {{ Candidate Models }}. Please select one model from the list.']}, 'rets': 0.4, 'error': None, 'perf': {'start_time': '2024-07-03T05:51:11.583334', 'end_time': '2024-07-03T05:51:11.926111'}, 'pid': 54862, 'tid': 6435957}, {'call_id': 'ba9d8945-aedf-450d-b117-11e076671be5', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first', 'method': {'obj': {'cls': {'name': 'RunnableParallel', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 13726547408, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': '_get_relevant_documents'}}, {'path': 'app.first.steps__.context.first.feedback', 'method': {'obj': {'cls': {'name': 'Feedback', 'module': {'package_name': 'trulens_eval.feedback', 'module_name': 'trulens_eval.feedback.feedback'}, 'bases': None}, 'id': 15142675120, 'init_bindings': None}, 'name': '__call__'}}], 'args': {'args': ['What is Task Decomposition?', "(3) Task execution: Expert models execute on the specific tasks and log results.\nInstruction:\n\nWith the input and the inference results, the AI assistant needs to describe the process and results. The previous stages can be formed as - User Input: {{ User Input }}, Task Planning: {{ Tasks }}, Model Selection: {{ Model Assignment }}, Task Execution: {{ Predictions }}. You must first answer the user's request in a straightforward manner. Then describe the task process and show your analysis and model inference results to the user in the first person. If inference results contain a file path, must tell the user the complete file path.\n\n(4) Response generation: LLM receives the execution results and provides summarized results to users.\nTo put HuggingGPT into real world usage, a couple challenges need to solve: (1) Efficiency improvement is needed as both LLM inference rounds and interactions with other models slow down the process; (2) It relies on a long context window to communicate over complicated task content; (3) Stability improvement of LLM outputs and external model services.\nAPI-Bank (Li et al. 2023) is a benchmark for evaluating the performance of tool-augmented LLMs. It contains 53 commonly used API tools, a complete tool-augmented LLM workflow, and 264 annotated dialogues that involve 568 API calls. The selection of APIs is quite diverse, including search engines, calculator, calendar queries, smart home control, schedule management, health data management, account authentication workflow and more. Because there are a large number of APIs, LLM first has access to API search engine to find the right API to call and then uses the corresponding documentation to make a call.\n\nFig. 12. Pseudo code of how LLM makes an API call in API-Bank. (Image source: Li et al. 2023)\nIn the API-Bank workflow, LLMs need to make a couple of decisions and at each step we can evaluate how accurate that decision is. Decisions include:\n\nWhether an API call is needed.\nIdentify the right API to call: if not good enough, LLMs need to iteratively modify the API inputs (e.g. deciding search keywords for Search Engine API).\nResponse based on the API results: the model can choose to refine and call again if results are not satisfied.\n\nThis benchmark evaluates the agent’s tool use capabilities at three levels:\n\nLevel-1 evaluates the ability to call the API. Given an API’s description, the model needs to determine whether to call a given API, call it correctly, and respond properly to API returns.\nLevel-2 examines the ability to retrieve the API. The model needs to search for possible APIs that may solve the user’s requirement and learn how to use them by reading documentation.\nLevel-3 assesses the ability to plan API beyond retrieve and call. Given unclear user requests (e.g. schedule group meetings, book flight/hotel/restaurant for a trip), the model may have to conduct multiple API calls to solve it.\n\nCase Studies#\nScientific Discovery Agent#\nChemCrow (Bran et al. 2023) is a domain-specific example in which LLM is augmented with 13 expert-designed tools to accomplish tasks across organic synthesis, drug discovery, and materials design. The workflow, implemented in LangChain, reflects what was previously described in the ReAct and MRKLs and combines CoT reasoning with tools relevant to the tasks:\n\nThe LLM is provided with a list of tool names, descriptions of their utility, and details about the expected input/output.\nIt is then instructed to answer a user-given prompt using the tools provided when necessary. The instruction suggests the model to follow the ReAct format - Thought, Action, Action Input, Observation."]}, 'rets': 0.7, 'error': None, 'perf': {'start_time': '2024-07-03T05:51:11.587395', 'end_time': '2024-07-03T05:51:11.962103'}, 'pid': 54862, 'tid': 6435958}, {'call_id': '1d661575-b055-4a76-a521-93d75e128e7b', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first', 'method': {'obj': {'cls': {'name': 'RunnableParallel', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 13726547408, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': '_get_relevant_documents'}}, {'path': 'app.first.steps__.context.first.feedback', 'method': {'obj': {'cls': {'name': 'Feedback', 'module': {'package_name': 'trulens_eval.feedback', 'module_name': 'trulens_eval.feedback.feedback'}, 'bases': None}, 'id': 15142675120, 'init_bindings': None}, 'name': '__call__'}}], 'args': {'args': ['What is Task Decomposition?', 'Fig. 6. Illustration of how Algorithm Distillation (AD) works. (Image source: Laskin et al. 2023).\nThe paper hypothesizes that any algorithm that generates a set of learning histories can be distilled into a neural network by performing behavioral cloning over actions. The history data is generated by a set of source policies, each trained for a specific task. At the training stage, during each RL run, a random task is sampled and a subsequence of multi-episode history is used for training, such that the learned policy is task-agnostic.\nIn reality, the model has limited context window length, so episodes should be short enough to construct multi-episode history. Multi-episodic contexts of 2-4 episodes are necessary to learn a near-optimal in-context RL algorithm. The emergence of in-context RL requires long enough context.\nIn comparison with three baselines, including ED (expert distillation, behavior cloning with expert trajectories instead of learning history), source policy (used for generating trajectories for distillation by UCB), RL^2 (Duan et al. 2017; used as upper bound since it needs online RL), AD demonstrates in-context RL with performance getting close to RL^2 despite only using offline RL and learns much faster than other baselines. When conditioned on partial training history of the source policy, AD also improves much faster than ED baseline.\n\nFig. 7. Comparison of AD, ED, source policy and RL^2 on environments that require memory and exploration. Only binary reward is assigned. The source policies are trained with A3C for "dark" environments and DQN for watermaze.(Image source: Laskin et al. 2023)\nComponent Two: Memory#\n(Big thank you to ChatGPT for helping me draft this section. I’ve learned a lot about the human brain and data structure for fast MIPS in my conversations with ChatGPT.)\nTypes of Memory#\nMemory can be defined as the processes used to acquire, store, retain, and later retrieve information. There are several types of memory in human brains.\n\n\nSensory Memory: This is the earliest stage of memory, providing the ability to retain impressions of sensory information (visual, auditory, etc) after the original stimuli have ended. Sensory memory typically only lasts for up to a few seconds. Subcategories include iconic memory (visual), echoic memory (auditory), and haptic memory (touch).\n\n\nShort-Term Memory (STM) or Working Memory: It stores information that we are currently aware of and needed to carry out complex cognitive tasks such as learning and reasoning. Short-term memory is believed to have the capacity of about 7 items (Miller 1956) and lasts for 20-30 seconds.\n\n\nLong-Term Memory (LTM): Long-term memory can store information for a remarkably long time, ranging from a few days to decades, with an essentially unlimited storage capacity. There are two subtypes of LTM:\n\nExplicit / declarative memory: This is memory of facts and events, and refers to those memories that can be consciously recalled, including episodic memory (events and experiences) and semantic memory (facts and concepts).\nImplicit / procedural memory: This type of memory is unconscious and involves skills and routines that are performed automatically, like riding a bike or typing on a keyboard.\n\n\n\n\nFig. 8. Categorization of human memory.\nWe can roughly consider the following mappings:\n\nSensory memory as learning embedding representations for raw inputs, including text, image or other modalities;\nShort-term memory as in-context learning. It is short and finite, as it is restricted by the finite context window length of Transformer.\nLong-term memory as the external vector store that the agent can attend to at query time, accessible via fast retrieval.']}, 'rets': 0.7, 'error': None, 'perf': {'start_time': '2024-07-03T05:51:11.588874', 'end_time': '2024-07-03T05:51:12.096782'}, 'pid': 54862, 'tid': 6435959}, {'call_id': '116aef8b-819d-4c8f-97ef-7c7cf9bbf679', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first', 'method': {'obj': {'cls': {'name': 'RunnableParallel', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 13726547408, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': '_get_relevant_documents'}}], 'args': {'query': 'What is Task Decomposition?', 'run_manager': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManagerForRetrieverRun', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13728753936, 'init_bindings': None}}}, 'rets': [{'page_content': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}], 'error': None, 'perf': {'start_time': '2024-07-03T05:51:11.344589', 'end_time': '2024-07-03T05:51:12.097110'}, 'pid': 54862, 'tid': 6435953}, {'call_id': 'd142197e-7eb5-4f9a-903b-709ee271b41a', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first', 'method': {'obj': {'cls': {'name': 'RunnableParallel', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 13726547408, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context.first', 'method': {'obj': {'cls': {'name': 'WithFeedbackFilterDocuments', 'module': {'package_name': 'trulens_eval.guardrails', 'module_name': 'trulens_eval.guardrails.langchain'}, 'bases': None}, 'id': 13728093840, 'init_bindings': None}, 'name': 'invoke'}}], 'args': {'input': 'What is Task Decomposition?', 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 14945277584, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, 'rets': [{'page_content': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}], 'error': None, 'perf': {'start_time': '2024-07-03T05:51:11.323578', 'end_time': '2024-07-03T05:51:12.097315'}, 'pid': 54862, 'tid': 6435953}, {'call_id': 'a41025a7-e33a-41a3-8fb2-c5750cad6820', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first', 'method': {'obj': {'cls': {'name': 'RunnableParallel', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 13726547408, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first.steps__.context', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}], 'args': {'input': 'What is Task Decomposition?', 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 15143364304, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, 'rets': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'error': None, 'perf': {'start_time': '2024-07-03T05:51:11.298906', 'end_time': '2024-07-03T05:51:12.097865'}, 'pid': 54862, 'tid': 6435953}, {'call_id': 'fa15611c-5c96-42ed-a9bb-ab7128b4598d', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.first', 'method': {'obj': {'cls': {'name': 'RunnableParallel', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 13726547408, 'init_bindings': None}, 'name': 'invoke'}}], 'args': {'input': 'What is Task Decomposition?', 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13728000272, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, 'rets': {'context': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'question': 'What is Task Decomposition?'}, 'error': None, 'perf': {'start_time': '2024-07-03T05:51:11.241491', 'end_time': '2024-07-03T05:51:12.098091'}, 'pid': 54862, 'tid': 6435330}, {'call_id': 'b917c04f-9f13-4771-8c83-ea5bcd954f73', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.middle[0]', 'method': {'obj': {'cls': {'name': 'ChatPromptTemplate', 'module': {'package_name': 'langchain_core.prompts', 'module_name': 'langchain_core.prompts.chat'}, 'bases': None}, 'id': 15129453584, 'init_bindings': None}, 'name': 'invoke'}}], 'args': {'input': {'context': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'question': 'What is Task Decomposition?'}, 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13726364240, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, 'rets': {'messages': [{'content': 'You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don\'t know the answer, just say that you don\'t know. Use three sentences maximum and keep the answer concise.\nQuestion: What is Task Decomposition? \nContext: Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results. \nAnswer:', 'additional_kwargs': {}, 'response_metadata': {}, 'type': 'human', 'name': None, 'id': None, 'example': False}]}, 'error': None, 'perf': {'start_time': '2024-07-03T05:51:12.178202', 'end_time': '2024-07-03T05:51:12.215342'}, 'pid': 54862, 'tid': 6435330}, {'call_id': '72a5680b-5c89-47ba-8e2d-2c00bf28bb46', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.middle[1]', 'method': {'obj': {'cls': {'name': 'ChatOpenAI', 'module': {'package_name': 'langchain_community.chat_models', 'module_name': 'langchain_community.chat_models.openai'}, 'bases': None}, 'id': 15122144848, 'init_bindings': None}, 'name': 'invoke'}}], 'args': {'input': {'messages': [{'content': 'You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don\'t know the answer, just say that you don\'t know. Use three sentences maximum and keep the answer concise.\nQuestion: What is Task Decomposition? \nContext: Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results. \nAnswer:', 'additional_kwargs': {}, 'response_metadata': {}, 'type': 'human', 'name': None, 'id': None, 'example': False}]}, 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 15145143184, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, 'rets': {'content': 'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', 'additional_kwargs': {}, 'response_metadata': {'token_usage': {'completion_tokens': 65, 'prompt_tokens': 836, 'total_tokens': 901}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, 'type': 'ai', 'name': None, 'id': 'run-05f3d6ee-fc3a-4a3f-aaf1-812eb7375cb9-0', 'example': False, 'tool_calls': [], 'invalid_tool_calls': [], 'usage_metadata': None}, 'error': None, 'perf': {'start_time': '2024-07-03T05:51:12.310321', 'end_time': '2024-07-03T05:51:13.631697'}, 'pid': 54862, 'tid': 6435330}, {'call_id': '7b1114dd-b7f8-4de4-993f-06dda4194d33', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}, {'path': 'app.last', 'method': {'obj': {'cls': {'name': 'StrOutputParser', 'module': {'package_name': 'langchain_core.output_parsers', 'module_name': 'langchain_core.output_parsers.string'}, 'bases': None}, 'id': 13724810128, 'init_bindings': None}, 'name': 'invoke'}}], 'args': {'input': {'content': 'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', 'additional_kwargs': {}, 'response_metadata': {'token_usage': {'completion_tokens': 65, 'prompt_tokens': 836, 'total_tokens': 901}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, 'type': 'ai', 'name': None, 'id': 'run-05f3d6ee-fc3a-4a3f-aaf1-812eb7375cb9-0', 'example': False, 'tool_calls': [], 'invalid_tool_calls': [], 'usage_metadata': None}, 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13727989648, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, 'rets': 'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', 'error': None, 'perf': {'start_time': '2024-07-03T05:51:13.690211', 'end_time': '2024-07-03T05:51:13.718171'}, 'pid': 54862, 'tid': 6435330}, {'call_id': 'c74b4da5-6555-4647-ae00-115a614e701d', 'stack': [{'path': 'app', 'method': {'obj': {'cls': {'name': 'RunnableSequence', 'module': {'package_name': 'langchain_core.runnables', 'module_name': 'langchain_core.runnables.base'}, 'bases': None}, 'id': 6209315280, 'init_bindings': None}, 'name': 'invoke'}}], 'args': {'input': 'What is Task Decomposition?'}, 'rets': 'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', 'error': None, 'perf': {'start_time': '2024-07-03T05:51:11.168333', 'end_time': '2024-07-03T05:51:13.718205'}, 'pid': 54862, 'tid': 6435330}], 'app': {'first': {'steps__': {'question': {'invoke': [RecordAppCall(call_id='201a07bb-6b2f-4616-ae75-0320ec1bc97b', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.question, method=Method(obj=Obj(cls=langchain_core.runnables.passthrough.RunnablePassthrough, id=6209785744, init_bindings=None), name='invoke'))], args={'input': 'What is Task Decomposition?', 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13729026896, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets='What is Task Decomposition?', error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 320329), end_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 340951)), pid=54862, tid=6435954)]}, 'context': {'first': {'_get_relevant_documents': [RecordAppCall(call_id='213e417f-c348-4f97-aae7-7be4a78594ba', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='_get_relevant_documents')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=langchain_core.vectorstores.VectorStoreRetriever, id=13728093840, init_bindings=None), name='_get_relevant_documents'))], args={'query': 'What is Task Decomposition?', 'run_manager': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManagerForRetrieverRun', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13728753936, 'init_bindings': None}}}, rets=[{'page_content': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}, {'page_content': 'Fig. 10. A picture of a sea otter using rock to crack open a seashell, while floating in the water. While some other animals can use tools, the complexity is not comparable with humans. (Image source: Animals using tools)\nMRKL (Karpas et al. 2022), short for “Modular Reasoning, Knowledge and Language”, is a neuro-symbolic architecture for autonomous agents. A MRKL system is proposed to contain a collection of “expert” modules and the general-purpose LLM works as a router to route inquiries to the best suitable expert module. These modules can be neural (e.g. deep learning models) or symbolic (e.g. math calculator, currency converter, weather API).\nThey did an experiment on fine-tuning LLM to call a calculator, using arithmetic as a test case. Their experiments showed that it was harder to solve verbal math problems than explicitly stated math problems because LLMs (7B Jurassic1-large model) failed to extract the right arguments for the basic arithmetic reliably. The results highlight when the external symbolic tools can work reliably, knowing when to and how to use the tools are crucial, determined by the LLM capability.\nBoth TALM (Tool Augmented Language Models; Parisi et al. 2022) and Toolformer (Schick et al. 2023) fine-tune a LM to learn to use external tool APIs. The dataset is expanded based on whether a newly added API call annotation can improve the quality of model outputs. See more details in the “External APIs” section of Prompt Engineering.\nChatGPT Plugins and OpenAI API function calling are good examples of LLMs augmented with tool use capability working in practice. The collection of tool APIs can be provided by other developers (as in Plugins) or self-defined (as in function calls).\nHuggingGPT (Shen et al. 2023) is a framework to use ChatGPT as the task planner to select models available in HuggingFace platform according to the model descriptions and summarize the response based on the execution results.\n\nFig. 11. Illustration of how HuggingGPT works. (Image source: Shen et al. 2023)\nThe system comprises of 4 stages:\n(1) Task planning: LLM works as the brain and parses the user requests into multiple tasks. There are four attributes associated with each task: task type, ID, dependencies, and arguments. They use few-shot examples to guide LLM to do task parsing and planning.\nInstruction:\n\nThe AI assistant can parse user input to several tasks: [{"task": task, "id", task_id, "dep": dependency_task_ids, "args": {"text": text, "image": URL, "audio": URL, "video": URL}}]. The "dep" field denotes the id of the previous task which generates a new resource that the current task relies on. A special tag "-task_id" refers to the generated text image, audio and video in the dependency task with id as task_id. The task MUST be selected from the following options: {{ Available Task List }}. There is a logical relationship between tasks, please note their order. If the user input can\'t be parsed, you need to reply empty JSON. Here are several cases for your reference: {{ Demonstrations }}. The chat history is recorded as {{ Chat History }}. From this chat history, you can find the path of the user-mentioned resources for your task planning.\n\n(2) Model selection: LLM distributes the tasks to expert models, where the request is framed as a multiple-choice question. LLM is presented with a list of models to choose from. Due to the limited context length, task type based filtration is needed.\nInstruction:\n\nGiven the user request and the call command, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The AI assistant merely outputs the model id of the most appropriate model. The output must be in a strict JSON format: "id": "id", "reason": "your detail reason for the choice". We have a list of models for you to choose from {{ Candidate Models }}. Please select one model from the list.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}, {'page_content': "(3) Task execution: Expert models execute on the specific tasks and log results.\nInstruction:\n\nWith the input and the inference results, the AI assistant needs to describe the process and results. The previous stages can be formed as - User Input: {{ User Input }}, Task Planning: {{ Tasks }}, Model Selection: {{ Model Assignment }}, Task Execution: {{ Predictions }}. You must first answer the user's request in a straightforward manner. Then describe the task process and show your analysis and model inference results to the user in the first person. If inference results contain a file path, must tell the user the complete file path.\n\n(4) Response generation: LLM receives the execution results and provides summarized results to users.\nTo put HuggingGPT into real world usage, a couple challenges need to solve: (1) Efficiency improvement is needed as both LLM inference rounds and interactions with other models slow down the process; (2) It relies on a long context window to communicate over complicated task content; (3) Stability improvement of LLM outputs and external model services.\nAPI-Bank (Li et al. 2023) is a benchmark for evaluating the performance of tool-augmented LLMs. It contains 53 commonly used API tools, a complete tool-augmented LLM workflow, and 264 annotated dialogues that involve 568 API calls. The selection of APIs is quite diverse, including search engines, calculator, calendar queries, smart home control, schedule management, health data management, account authentication workflow and more. Because there are a large number of APIs, LLM first has access to API search engine to find the right API to call and then uses the corresponding documentation to make a call.\n\nFig. 12. Pseudo code of how LLM makes an API call in API-Bank. (Image source: Li et al. 2023)\nIn the API-Bank workflow, LLMs need to make a couple of decisions and at each step we can evaluate how accurate that decision is. Decisions include:\n\nWhether an API call is needed.\nIdentify the right API to call: if not good enough, LLMs need to iteratively modify the API inputs (e.g. deciding search keywords for Search Engine API).\nResponse based on the API results: the model can choose to refine and call again if results are not satisfied.\n\nThis benchmark evaluates the agent’s tool use capabilities at three levels:\n\nLevel-1 evaluates the ability to call the API. Given an API’s description, the model needs to determine whether to call a given API, call it correctly, and respond properly to API returns.\nLevel-2 examines the ability to retrieve the API. The model needs to search for possible APIs that may solve the user’s requirement and learn how to use them by reading documentation.\nLevel-3 assesses the ability to plan API beyond retrieve and call. Given unclear user requests (e.g. schedule group meetings, book flight/hotel/restaurant for a trip), the model may have to conduct multiple API calls to solve it.\n\nCase Studies#\nScientific Discovery Agent#\nChemCrow (Bran et al. 2023) is a domain-specific example in which LLM is augmented with 13 expert-designed tools to accomplish tasks across organic synthesis, drug discovery, and materials design. The workflow, implemented in LangChain, reflects what was previously described in the ReAct and MRKLs and combines CoT reasoning with tools relevant to the tasks:\n\nThe LLM is provided with a list of tool names, descriptions of their utility, and details about the expected input/output.\nIt is then instructed to answer a user-given prompt using the tools provided when necessary. The instruction suggests the model to follow the ReAct format - Thought, Action, Action Input, Observation.", 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}, {'page_content': 'Fig. 6. Illustration of how Algorithm Distillation (AD) works. (Image source: Laskin et al. 2023).\nThe paper hypothesizes that any algorithm that generates a set of learning histories can be distilled into a neural network by performing behavioral cloning over actions. The history data is generated by a set of source policies, each trained for a specific task. At the training stage, during each RL run, a random task is sampled and a subsequence of multi-episode history is used for training, such that the learned policy is task-agnostic.\nIn reality, the model has limited context window length, so episodes should be short enough to construct multi-episode history. Multi-episodic contexts of 2-4 episodes are necessary to learn a near-optimal in-context RL algorithm. The emergence of in-context RL requires long enough context.\nIn comparison with three baselines, including ED (expert distillation, behavior cloning with expert trajectories instead of learning history), source policy (used for generating trajectories for distillation by UCB), RL^2 (Duan et al. 2017; used as upper bound since it needs online RL), AD demonstrates in-context RL with performance getting close to RL^2 despite only using offline RL and learns much faster than other baselines. When conditioned on partial training history of the source policy, AD also improves much faster than ED baseline.\n\nFig. 7. Comparison of AD, ED, source policy and RL^2 on environments that require memory and exploration. Only binary reward is assigned. The source policies are trained with A3C for "dark" environments and DQN for watermaze.(Image source: Laskin et al. 2023)\nComponent Two: Memory#\n(Big thank you to ChatGPT for helping me draft this section. I’ve learned a lot about the human brain and data structure for fast MIPS in my conversations with ChatGPT.)\nTypes of Memory#\nMemory can be defined as the processes used to acquire, store, retain, and later retrieve information. There are several types of memory in human brains.\n\n\nSensory Memory: This is the earliest stage of memory, providing the ability to retain impressions of sensory information (visual, auditory, etc) after the original stimuli have ended. Sensory memory typically only lasts for up to a few seconds. Subcategories include iconic memory (visual), echoic memory (auditory), and haptic memory (touch).\n\n\nShort-Term Memory (STM) or Working Memory: It stores information that we are currently aware of and needed to carry out complex cognitive tasks such as learning and reasoning. Short-term memory is believed to have the capacity of about 7 items (Miller 1956) and lasts for 20-30 seconds.\n\n\nLong-Term Memory (LTM): Long-term memory can store information for a remarkably long time, ranging from a few days to decades, with an essentially unlimited storage capacity. There are two subtypes of LTM:\n\nExplicit / declarative memory: This is memory of facts and events, and refers to those memories that can be consciously recalled, including episodic memory (events and experiences) and semantic memory (facts and concepts).\nImplicit / procedural memory: This type of memory is unconscious and involves skills and routines that are performed automatically, like riding a bike or typing on a keyboard.\n\n\n\n\nFig. 8. Categorization of human memory.\nWe can roughly consider the following mappings:\n\nSensory memory as learning embedding representations for raw inputs, including text, image or other modalities;\nShort-term memory as in-context learning. It is short and finite, as it is restricted by the finite context window length of Transformer.\nLong-term memory as the external vector store that the agent can attend to at query time, accessible via fast retrieval.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}], error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 359885), end_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 572243)), pid=54862, tid=6435953), RecordAppCall(call_id='116aef8b-819d-4c8f-97ef-7c7cf9bbf679', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='_get_relevant_documents'))], args={'query': 'What is Task Decomposition?', 'run_manager': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManagerForRetrieverRun', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13728753936, 'init_bindings': None}}}, rets=[{'page_content': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}], error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 344589), end_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 97110)), pid=54862, tid=6435953)], 'feedback': {'__call__': [RecordAppCall(call_id='15a79df3-d358-42b9-88c9-53ea69d4a05d', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='_get_relevant_documents')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first.feedback, method=Method(obj=Obj(cls=trulens_eval.feedback.feedback.Feedback, id=15142675120, init_bindings=None), name='__call__'))], args={'args': ['What is Task Decomposition?', 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.']}, rets=0.8, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 579728), end_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 910319)), pid=54862, tid=6435956), RecordAppCall(call_id='97f870cf-22d6-4b09-8c35-3f8eeeb89272', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='_get_relevant_documents')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first.feedback, method=Method(obj=Obj(cls=trulens_eval.feedback.feedback.Feedback, id=15142675120, init_bindings=None), name='__call__'))], args={'args': ['What is Task Decomposition?', 'Fig. 10. A picture of a sea otter using rock to crack open a seashell, while floating in the water. While some other animals can use tools, the complexity is not comparable with humans. (Image source: Animals using tools)\nMRKL (Karpas et al. 2022), short for “Modular Reasoning, Knowledge and Language”, is a neuro-symbolic architecture for autonomous agents. A MRKL system is proposed to contain a collection of “expert” modules and the general-purpose LLM works as a router to route inquiries to the best suitable expert module. These modules can be neural (e.g. deep learning models) or symbolic (e.g. math calculator, currency converter, weather API).\nThey did an experiment on fine-tuning LLM to call a calculator, using arithmetic as a test case. Their experiments showed that it was harder to solve verbal math problems than explicitly stated math problems because LLMs (7B Jurassic1-large model) failed to extract the right arguments for the basic arithmetic reliably. The results highlight when the external symbolic tools can work reliably, knowing when to and how to use the tools are crucial, determined by the LLM capability.\nBoth TALM (Tool Augmented Language Models; Parisi et al. 2022) and Toolformer (Schick et al. 2023) fine-tune a LM to learn to use external tool APIs. The dataset is expanded based on whether a newly added API call annotation can improve the quality of model outputs. See more details in the “External APIs” section of Prompt Engineering.\nChatGPT Plugins and OpenAI API function calling are good examples of LLMs augmented with tool use capability working in practice. The collection of tool APIs can be provided by other developers (as in Plugins) or self-defined (as in function calls).\nHuggingGPT (Shen et al. 2023) is a framework to use ChatGPT as the task planner to select models available in HuggingFace platform according to the model descriptions and summarize the response based on the execution results.\n\nFig. 11. Illustration of how HuggingGPT works. (Image source: Shen et al. 2023)\nThe system comprises of 4 stages:\n(1) Task planning: LLM works as the brain and parses the user requests into multiple tasks. There are four attributes associated with each task: task type, ID, dependencies, and arguments. They use few-shot examples to guide LLM to do task parsing and planning.\nInstruction:\n\nThe AI assistant can parse user input to several tasks: [{"task": task, "id", task_id, "dep": dependency_task_ids, "args": {"text": text, "image": URL, "audio": URL, "video": URL}}]. The "dep" field denotes the id of the previous task which generates a new resource that the current task relies on. A special tag "-task_id" refers to the generated text image, audio and video in the dependency task with id as task_id. The task MUST be selected from the following options: {{ Available Task List }}. There is a logical relationship between tasks, please note their order. If the user input can\'t be parsed, you need to reply empty JSON. Here are several cases for your reference: {{ Demonstrations }}. The chat history is recorded as {{ Chat History }}. From this chat history, you can find the path of the user-mentioned resources for your task planning.\n\n(2) Model selection: LLM distributes the tasks to expert models, where the request is framed as a multiple-choice question. LLM is presented with a list of models to choose from. Due to the limited context length, task type based filtration is needed.\nInstruction:\n\nGiven the user request and the call command, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The AI assistant merely outputs the model id of the most appropriate model. The output must be in a strict JSON format: "id": "id", "reason": "your detail reason for the choice". We have a list of models for you to choose from {{ Candidate Models }}. Please select one model from the list.']}, rets=0.4, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 583334), end_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 926111)), pid=54862, tid=6435957), RecordAppCall(call_id='ba9d8945-aedf-450d-b117-11e076671be5', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='_get_relevant_documents')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first.feedback, method=Method(obj=Obj(cls=trulens_eval.feedback.feedback.Feedback, id=15142675120, init_bindings=None), name='__call__'))], args={'args': ['What is Task Decomposition?', "(3) Task execution: Expert models execute on the specific tasks and log results.\nInstruction:\n\nWith the input and the inference results, the AI assistant needs to describe the process and results. The previous stages can be formed as - User Input: {{ User Input }}, Task Planning: {{ Tasks }}, Model Selection: {{ Model Assignment }}, Task Execution: {{ Predictions }}. You must first answer the user's request in a straightforward manner. Then describe the task process and show your analysis and model inference results to the user in the first person. If inference results contain a file path, must tell the user the complete file path.\n\n(4) Response generation: LLM receives the execution results and provides summarized results to users.\nTo put HuggingGPT into real world usage, a couple challenges need to solve: (1) Efficiency improvement is needed as both LLM inference rounds and interactions with other models slow down the process; (2) It relies on a long context window to communicate over complicated task content; (3) Stability improvement of LLM outputs and external model services.\nAPI-Bank (Li et al. 2023) is a benchmark for evaluating the performance of tool-augmented LLMs. It contains 53 commonly used API tools, a complete tool-augmented LLM workflow, and 264 annotated dialogues that involve 568 API calls. The selection of APIs is quite diverse, including search engines, calculator, calendar queries, smart home control, schedule management, health data management, account authentication workflow and more. Because there are a large number of APIs, LLM first has access to API search engine to find the right API to call and then uses the corresponding documentation to make a call.\n\nFig. 12. Pseudo code of how LLM makes an API call in API-Bank. (Image source: Li et al. 2023)\nIn the API-Bank workflow, LLMs need to make a couple of decisions and at each step we can evaluate how accurate that decision is. Decisions include:\n\nWhether an API call is needed.\nIdentify the right API to call: if not good enough, LLMs need to iteratively modify the API inputs (e.g. deciding search keywords for Search Engine API).\nResponse based on the API results: the model can choose to refine and call again if results are not satisfied.\n\nThis benchmark evaluates the agent’s tool use capabilities at three levels:\n\nLevel-1 evaluates the ability to call the API. Given an API’s description, the model needs to determine whether to call a given API, call it correctly, and respond properly to API returns.\nLevel-2 examines the ability to retrieve the API. The model needs to search for possible APIs that may solve the user’s requirement and learn how to use them by reading documentation.\nLevel-3 assesses the ability to plan API beyond retrieve and call. Given unclear user requests (e.g. schedule group meetings, book flight/hotel/restaurant for a trip), the model may have to conduct multiple API calls to solve it.\n\nCase Studies#\nScientific Discovery Agent#\nChemCrow (Bran et al. 2023) is a domain-specific example in which LLM is augmented with 13 expert-designed tools to accomplish tasks across organic synthesis, drug discovery, and materials design. The workflow, implemented in LangChain, reflects what was previously described in the ReAct and MRKLs and combines CoT reasoning with tools relevant to the tasks:\n\nThe LLM is provided with a list of tool names, descriptions of their utility, and details about the expected input/output.\nIt is then instructed to answer a user-given prompt using the tools provided when necessary. The instruction suggests the model to follow the ReAct format - Thought, Action, Action Input, Observation."]}, rets=0.7, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 587395), end_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 962103)), pid=54862, tid=6435958), RecordAppCall(call_id='1d661575-b055-4a76-a521-93d75e128e7b', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='_get_relevant_documents')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first.feedback, method=Method(obj=Obj(cls=trulens_eval.feedback.feedback.Feedback, id=15142675120, init_bindings=None), name='__call__'))], args={'args': ['What is Task Decomposition?', 'Fig. 6. Illustration of how Algorithm Distillation (AD) works. (Image source: Laskin et al. 2023).\nThe paper hypothesizes that any algorithm that generates a set of learning histories can be distilled into a neural network by performing behavioral cloning over actions. The history data is generated by a set of source policies, each trained for a specific task. At the training stage, during each RL run, a random task is sampled and a subsequence of multi-episode history is used for training, such that the learned policy is task-agnostic.\nIn reality, the model has limited context window length, so episodes should be short enough to construct multi-episode history. Multi-episodic contexts of 2-4 episodes are necessary to learn a near-optimal in-context RL algorithm. The emergence of in-context RL requires long enough context.\nIn comparison with three baselines, including ED (expert distillation, behavior cloning with expert trajectories instead of learning history), source policy (used for generating trajectories for distillation by UCB), RL^2 (Duan et al. 2017; used as upper bound since it needs online RL), AD demonstrates in-context RL with performance getting close to RL^2 despite only using offline RL and learns much faster than other baselines. When conditioned on partial training history of the source policy, AD also improves much faster than ED baseline.\n\nFig. 7. Comparison of AD, ED, source policy and RL^2 on environments that require memory and exploration. Only binary reward is assigned. The source policies are trained with A3C for "dark" environments and DQN for watermaze.(Image source: Laskin et al. 2023)\nComponent Two: Memory#\n(Big thank you to ChatGPT for helping me draft this section. I’ve learned a lot about the human brain and data structure for fast MIPS in my conversations with ChatGPT.)\nTypes of Memory#\nMemory can be defined as the processes used to acquire, store, retain, and later retrieve information. There are several types of memory in human brains.\n\n\nSensory Memory: This is the earliest stage of memory, providing the ability to retain impressions of sensory information (visual, auditory, etc) after the original stimuli have ended. Sensory memory typically only lasts for up to a few seconds. Subcategories include iconic memory (visual), echoic memory (auditory), and haptic memory (touch).\n\n\nShort-Term Memory (STM) or Working Memory: It stores information that we are currently aware of and needed to carry out complex cognitive tasks such as learning and reasoning. Short-term memory is believed to have the capacity of about 7 items (Miller 1956) and lasts for 20-30 seconds.\n\n\nLong-Term Memory (LTM): Long-term memory can store information for a remarkably long time, ranging from a few days to decades, with an essentially unlimited storage capacity. There are two subtypes of LTM:\n\nExplicit / declarative memory: This is memory of facts and events, and refers to those memories that can be consciously recalled, including episodic memory (events and experiences) and semantic memory (facts and concepts).\nImplicit / procedural memory: This type of memory is unconscious and involves skills and routines that are performed automatically, like riding a bike or typing on a keyboard.\n\n\n\n\nFig. 8. Categorization of human memory.\nWe can roughly consider the following mappings:\n\nSensory memory as learning embedding representations for raw inputs, including text, image or other modalities;\nShort-term memory as in-context learning. It is short and finite, as it is restricted by the finite context window length of Transformer.\nLong-term memory as the external vector store that the agent can attend to at query time, accessible via fast retrieval.']}, rets=0.7, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 588874), end_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 96782)), pid=54862, tid=6435959)]}, 'invoke': [RecordAppCall(call_id='d142197e-7eb5-4f9a-903b-709ee271b41a', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context.first, method=Method(obj=Obj(cls=trulens_eval.guardrails.langchain.WithFeedbackFilterDocuments, id=13728093840, init_bindings=None), name='invoke'))], args={'input': 'What is Task Decomposition?', 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 14945277584, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets=[{'page_content': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'metadata': {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, 'type': 'Document'}], error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 323578), end_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 97315)), pid=54862, tid=6435953)]}, 'invoke': [RecordAppCall(call_id='a41025a7-e33a-41a3-8fb2-c5750cad6820', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first.steps__.context, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke'))], args={'input': 'What is Task Decomposition?', 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 15143364304, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets='Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 298906), end_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 97865)), pid=54862, tid=6435953)]}}, 'invoke': [RecordAppCall(call_id='fa15611c-5c96-42ed-a9bb-ab7128b4598d', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.first, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableParallel, id=13726547408, init_bindings=None), name='invoke'))], args={'input': 'What is Task Decomposition?', 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13728000272, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets={'context': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'question': 'What is Task Decomposition?'}, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 241491), end_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 98091)), pid=54862, tid=6435330)]}, 'middle': [{'invoke': [RecordAppCall(call_id='b917c04f-9f13-4771-8c83-ea5bcd954f73', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.middle[0], method=Method(obj=Obj(cls=langchain_core.prompts.chat.ChatPromptTemplate, id=15129453584, init_bindings=None), name='invoke'))], args={'input': {'context': 'Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results.', 'question': 'What is Task Decomposition?'}, 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13726364240, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets={'messages': [{'content': 'You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don\'t know the answer, just say that you don\'t know. Use three sentences maximum and keep the answer concise.\nQuestion: What is Task Decomposition? \nContext: Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results. \nAnswer:', 'additional_kwargs': {}, 'response_metadata': {}, 'type': 'human', 'name': None, 'id': None, 'example': False}]}, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 178202), end_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 215342)), pid=54862, tid=6435330)]}, {'invoke': [RecordAppCall(call_id='72a5680b-5c89-47ba-8e2d-2c00bf28bb46', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.middle[1], method=Method(obj=Obj(cls=langchain_community.chat_models.openai.ChatOpenAI, id=15122144848, init_bindings=None), name='invoke'))], args={'input': {'messages': [{'content': 'You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don\'t know the answer, just say that you don\'t know. Use three sentences maximum and keep the answer concise.\nQuestion: What is Task Decomposition? \nContext: Fig. 1. Overview of a LLM-powered autonomous agent system.\nComponent One: Planning#\nA complicated task usually involves many steps. An agent needs to know what they are and plan ahead.\nTask Decomposition#\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for enhancing model performance on complex tasks. The model is instructed to “think step by step” to utilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT transforms big tasks into multiple manageable tasks and shed lights into an interpretation of the model’s thinking process.\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.\nTask decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.\nAnother quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition Language (PDDL) as an intermediate interface to describe the planning problem. In this process, LLM (1) translates the problem into “Problem PDDL”, then (2) requests a classical planner to generate a PDDL plan based on an existing “Domain PDDL”, and finally (3) translates the PDDL plan back into natural language. Essentially, the planning step is outsourced to an external tool, assuming the availability of domain-specific PDDL and a suitable planner which is common in certain robotic setups but not in many other domains.\nSelf-Reflection#\nSelf-reflection is a vital aspect that allows autonomous agents to improve iteratively by refining past action decisions and correcting previous mistakes. It plays a crucial role in real-world tasks where trial and error are inevitable.\nReAct (Yao et al. 2023) integrates reasoning and acting within LLM by extending the action space to be a combination of task-specific discrete actions and the language space. The former enables LLM to interact with the environment (e.g. use Wikipedia search API), while the latter prompting LLM to generate reasoning traces in natural language.\nThe ReAct prompt template incorporates explicit steps for LLM to think, roughly formatted as:\nThought: ...\nAction: ...\nObservation: ...\n... (Repeated many times)\n\nFig. 2. Examples of reasoning trajectories for knowledge-intensive tasks (e.g. HotpotQA, FEVER) and decision-making tasks (e.g. AlfWorld Env, WebShop). (Image source: Yao et al. 2023).\nIn both experiments on knowledge-intensive tasks and decision-making tasks, ReAct works better than the Act-only baseline where Thought: … step is removed.\nReflexion (Shinn & Labash 2023) is a framework to equips agents with dynamic memory and self-reflection capabilities to improve reasoning skills. Reflexion has a standard RL setup, in which the reward model provides a simple binary reward and the action space follows the setup in ReAct where the task-specific action space is augmented with language to enable complex reasoning steps. After each action $a_t$, the agent computes a heuristic $h_t$ and optionally may decide to reset the environment to start a new trial depending on the self-reflection results. \nAnswer:', 'additional_kwargs': {}, 'response_metadata': {}, 'type': 'human', 'name': None, 'id': None, 'example': False}]}, 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 15145143184, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets={'content': 'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', 'additional_kwargs': {}, 'response_metadata': {'token_usage': {'completion_tokens': 65, 'prompt_tokens': 836, 'total_tokens': 901}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, 'type': 'ai', 'name': None, 'id': 'run-05f3d6ee-fc3a-4a3f-aaf1-812eb7375cb9-0', 'example': False, 'tool_calls': [], 'invalid_tool_calls': [], 'usage_metadata': None}, error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 12, 310321), end_time=datetime.datetime(2024, 7, 3, 5, 51, 13, 631697)), pid=54862, tid=6435330)]}], 'last': {'invoke': [RecordAppCall(call_id='7b1114dd-b7f8-4de4-993f-06dda4194d33', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke')), RecordAppCallMethod(path=Lens().app.last, method=Method(obj=Obj(cls=langchain_core.output_parsers.string.StrOutputParser, id=13724810128, init_bindings=None), name='invoke'))], args={'input': {'content': 'Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', 'additional_kwargs': {}, 'response_metadata': {'token_usage': {'completion_tokens': 65, 'prompt_tokens': 836, 'total_tokens': 901}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, 'type': 'ai', 'name': None, 'id': 'run-05f3d6ee-fc3a-4a3f-aaf1-812eb7375cb9-0', 'example': False, 'tool_calls': [], 'invalid_tool_calls': [], 'usage_metadata': None}, 'config': {'tags': [], 'metadata': {}, 'callbacks': {'__tru_non_serialized_object': {'cls': {'name': 'CallbackManager', 'module': {'package_name': 'langchain_core.callbacks', 'module_name': 'langchain_core.callbacks.manager'}, 'bases': None}, 'id': 13727989648, 'init_bindings': None}}, 'recursion_limit': 25, 'configurable': {}}}, rets='Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 13, 690211), end_time=datetime.datetime(2024, 7, 3, 5, 51, 13, 718171)), pid=54862, tid=6435330)]}, 'invoke': [RecordAppCall(call_id='c74b4da5-6555-4647-ae00-115a614e701d', stack=[RecordAppCallMethod(path=Lens().app, method=Method(obj=Obj(cls=langchain_core.runnables.base.RunnableSequence, id=6209315280, init_bindings=None), name='invoke'))], args={'input': 'What is Task Decomposition?'}, rets='Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be achieved through prompting techniques like Chain of Thought or Tree of Thoughts, or by using task-specific instructions. Task decomposition can also involve outsourcing the planning step to an external classical planner, as seen in the LLM+P approach.', error=None, perf=Perf(start_time=datetime.datetime(2024, 7, 3, 5, 51, 11, 168333), end_time=datetime.datetime(2024, 7, 3, 5, 51, 13, 718205)), pid=54862, tid=6435330)]}})
from ipytree import Tree, Node
def display_call_stack(data):
tree = Tree()
tree.add_node(Node('Record ID: {}'.format(data['record_id'])))
tree.add_node(Node('App ID: {}'.format(data['app_id'])))
tree.add_node(Node('Cost: {}'.format(data['cost'])))
tree.add_node(Node('Performance: {}'.format(data['perf'])))
tree.add_node(Node('Timestamp: {}'.format(data['ts'])))
tree.add_node(Node('Tags: {}'.format(data['tags'])))
tree.add_node(Node('Main Input: {}'.format(data['main_input'])))
tree.add_node(Node('Main Output: {}'.format(data['main_output'])))
tree.add_node(Node('Main Error: {}'.format(data['main_error'])))
calls_node = Node('Calls')
tree.add_node(calls_node)
for call in data['calls']:
call_node = Node('Call')
calls_node.add_node(call_node)
for step in call['stack']:
step_node = Node('Step: {}'.format(step['path']))
call_node.add_node(step_node)
if 'expanded' in step:
expanded_node = Node('Expanded')
step_node.add_node(expanded_node)
for expanded_step in step['expanded']:
expanded_step_node = Node('Step: {}'.format(expanded_step['path']))
expanded_node.add_node(expanded_step_node)
return tree
# Usage
tree = display_call_stack(json_like)
tree
Tree(nodes=(Node(name='Record ID: record_hash_b58260de958b6a44f38e92067bd75220'), Node(name='App ID: Chain1_Ch…