Iterating on LLM Apps with TruLens¶
In this example, we will build a first RAG prototype to answer questions from the Insurance Handbook PDF. Using TruLens, we will identify early failure modes and then iterate to ensure the app is honest, harmless, and helpful.
In [ ]:
# !pip install trulens trulens-apps-llamaindex trulens-providers-openai langchain llama_index llama-index-llms-openai llama_hub llmsherpa
In [ ]:
# Set your API keys. If they are already set in your environment variables, you can skip this step.
import os

os.environ["OPENAI_API_KEY"] = "sk-..."
In [ ]:
from trulens.core import TruSession

session = TruSession()
In [ ]:
from trulens.dashboard import run_dashboard

run_dashboard(session)
Start with basic RAG.¶
In [ ]:
from llama_hub.smart_pdf_loader import SmartPDFLoader

llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)

documents = pdf_loader.load_data(
    "https://www.iii.org/sites/default/files/docs/pdf/Insurance_Handbook_20103.pdf"
)
In [ ]:
from llama_index.core import Document
from llama_index.core import Prompt
from llama_index.core import ServiceContext
from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI

# initialize llm
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.5)

# knowledge store
document = Document(text="\n\n".join([doc.text for doc in documents]))

# service context for index
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model="local:BAAI/bge-small-en-v1.5"
)

# create index
index = VectorStoreIndex.from_documents(
    [document], service_context=service_context
)

system_prompt = Prompt(
    "We have provided context information below that you may use. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Please answer the question: {query_str}\n"
)

# basic rag query engine
rag_basic = index.as_query_engine(text_qa_template=system_prompt)
Load test set¶
In [ ]:
honest_evals = [
    "What are the typical coverage options for homeowners insurance?",
    "What are the requirements for long term care insurance to start?",
    "Can annuity benefits be passed to beneficiaries?",
    "Are credit scores used to set insurance premiums? If so, how?",
    "Who provides flood insurance?",
    "Can you get flood insurance outside high-risk areas?",
    "How much in losses does fraud account for in property & casualty insurance?",
    "Do pay-as-you-drive insurance policies have an impact on greenhouse gas emissions? How much?",
    "What was the most costly earthquake in US history for insurers?",
    "Does it matter who is at fault to be compensated when injured on the job?",
]
Set up Evaluation¶
In [ ]:
import numpy as np
from trulens.core import Feedback
from trulens.core import TruSession
from trulens.apps.llamaindex import TruLlama
from trulens.providers.openai import OpenAI as fOpenAI

session = TruSession()

# start fresh
session.reset_database()

provider = fOpenAI()

context = TruLlama.select_context()

answer_relevance = Feedback(
    provider.relevance_with_cot_reasons, name="Answer Relevance"
).on_input_output()

context_relevance = (
    Feedback(
        provider.context_relevance_with_cot_reasons, name="Context Relevance"
    )
    .on_input()
    .on(context)
    .aggregate(np.mean)
)
In [ ]:
# embedding distance
from langchain.embeddings.openai import OpenAIEmbeddings
from trulens.feedback.embeddings import Embeddings

model_name = "text-embedding-ada-002"

embed_model = OpenAIEmbeddings(
    model=model_name, openai_api_key=os.environ["OPENAI_API_KEY"]
)

embed = Embeddings(embed_model=embed_model)
f_embed_dist = Feedback(embed.cosine_distance).on_input().on(context)

f_groundedness = (
    Feedback(
        provider.groundedness_measure_with_cot_reasons, name="Groundedness"
    )
    .on(context.collect())
    .on_output()
)

honest_feedbacks = [
    answer_relevance,
    context_relevance,
    f_embed_dist,
    f_groundedness,
]

tru_recorder_rag_basic = TruLlama(
    rag_basic, app_name="RAG", app_version="1_baseline", feedbacks=honest_feedbacks
)
In [ ]:
from trulens.dashboard import run_dashboard

run_dashboard(session)
In [ ]:
# Run evaluation on 10 sample questions
with tru_recorder_rag_basic as recording:
    for question in honest_evals:
        response = rag_basic.query(question)
In [ ]:
session.get_leaderboard(app_ids=[tru_recorder_rag_basic.app_id])
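Beyond aggregate scores, we can drill into individual records to see where the app falls short. The sketch below assumes TruSession.get_records_and_feedback, which returns the records as a DataFrame (with input and output columns) along with the names of the feedback columns:

In [ ]:

# Pull the records for this app version together with their feedback scores.
records_df, feedback_cols = session.get_records_and_feedback(
    app_ids=[tru_recorder_rag_basic.app_id]
)

# Sort by groundedness to surface the least grounded answers first.
records_df[["input", "output"] + feedback_cols].sort_values("Groundedness").head()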
Our simple RAG often fails to retrieve enough information from the insurance manual to properly answer the question. The information needed may lie just outside the chunk that our app identifies and retrieves.
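One way to iterate on this failure mode is to make retrieval less brittle: index smaller, overlapping chunks and retrieve more of them per query. The sketch below reuses the components defined above; the chunk_size, chunk_overlap, and similarity_top_k values are illustrative assumptions, not tuned settings:

In [ ]:

# Smaller, overlapping chunks reduce the chance that the needed information
# falls just outside a chunk boundary.
service_context_v2 = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    chunk_size=256,  # illustrative, not tuned
    chunk_overlap=64,  # illustrative, not tuned
)

index_v2 = VectorStoreIndex.from_documents(
    [document], service_context=service_context_v2
)

# Retrieve more chunks per query so the synthesizer sees wider context.
rag_more_context = index_v2.as_query_engine(
    text_qa_template=system_prompt, similarity_top_k=5
)

Wrapping this version in its own TruLlama recorder (for example with app_version="2_more_context") lets the leaderboard compare it directly against the baseline.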