📓 Ground Truth Evaluations¶
In this quickstart you will create and evaluate a simple LLM app using ground truth. Ground truth evaluation can be especially useful during early LLM experiments when you have a small set of example queries that are critical to get right.
Ground truth evaluation works by measuring the similarity between an LLM response and its matching verified response.
Add API keys¶
For this quickstart, you will need an OpenAI API key.
In [ ]:
Copied!
# !pip install trulens trulens-provider-openai openai
# !pip install trulens trulens-provider-openai openai
In [ ]:
Copied!
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
In [ ]:
Copied!
from trulens.core import TruSession
session = TruSession()
from trulens.core import TruSession
session = TruSession()
Create Simple LLM Application¶
In [ ]:
Copied!
from openai import OpenAI
from trulens.apps.custom import instrument
oai_client = OpenAI()
class APP:
    """Minimal LLM application that answers a question with one chat call."""

    @instrument
    def completion(self, prompt):
        """Ask the chat model to answer ``prompt`` and return the reply text.

        Args:
            prompt: The question to ask. It is embedded in a single
                ``user`` message of the form
                ``"Please answer the question: <prompt>"``.

        Returns:
            The ``content`` of the message in the first returned choice.
        """
        completion = (
            oai_client.chat.completions.create(
                model="gpt-3.5-turbo",
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": f"Please answer the question: {prompt}",
                    }
                ],
            )
            .choices[0]
            .message.content
        )
        return completion
llm_app = APP()
from openai import OpenAI
from trulens.apps.custom import instrument
oai_client = OpenAI()
class APP:
    """Minimal LLM application that answers a question with one chat call."""

    @instrument
    def completion(self, prompt):
        """Ask the chat model to answer ``prompt`` and return the reply text.

        Args:
            prompt: The question to ask. It is embedded in a single
                ``user`` message of the form
                ``"Please answer the question: <prompt>"``.

        Returns:
            The ``content`` of the message in the first returned choice.
        """
        completion = (
            oai_client.chat.completions.create(
                model="gpt-3.5-turbo",
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": f"Please answer the question: {prompt}",
                    }
                ],
            )
            .choices[0]
            .message.content
        )
        return completion
llm_app = APP()
Initialize Feedback Function(s)¶
In [ ]:
Copied!
from trulens.core import Feedback
from trulens.feedback import GroundTruthAgreement
from trulens.providers.openai import OpenAI as fOpenAI
# Golden set: each entry pairs a query with the response it is expected to
# produce. The English and Spanish questions share the same expected answer.
golden_set = [
{
"query": "who invented the lightbulb?",
"expected_response": "Thomas Edison",
},
{
"query": "¿quien invento la bombilla?",
"expected_response": "Thomas Edison",
},
]
# Feedback built from GroundTruthAgreement.agreement_measure, constructed
# with the golden set and an OpenAI-backed provider.
# NOTE(review): on_input_output() presumably applies the feedback to the
# app's input and output - confirm against the TruLens documentation.
f_groundtruth = Feedback(
GroundTruthAgreement(golden_set, provider=fOpenAI()).agreement_measure,
name="Ground Truth Semantic Agreement",
).on_input_output()
from trulens.core import Feedback
from trulens.feedback import GroundTruthAgreement
from trulens.providers.openai import OpenAI as fOpenAI
# Golden set: each entry pairs a query with the response it is expected to
# produce. The English and Spanish questions share the same expected answer.
golden_set = [
{
"query": "who invented the lightbulb?",
"expected_response": "Thomas Edison",
},
{
"query": "¿quien invento la bombilla?",
"expected_response": "Thomas Edison",
},
]
# Feedback built from GroundTruthAgreement.agreement_measure, constructed
# with the golden set and an OpenAI-backed provider.
# NOTE(review): on_input_output() presumably applies the feedback to the
# app's input and output - confirm against the TruLens documentation.
f_groundtruth = Feedback(
GroundTruthAgreement(golden_set, provider=fOpenAI()).agreement_measure,
name="Ground Truth Semantic Agreement",
).on_input_output()
Instrument the app for logging with TruLens¶
In [ ]:
Copied!
# Wrap llm_app in a TruCustomApp recorder, registered under app name
# "LLM App" / version "v1", with f_groundtruth attached as its feedback.
from trulens.apps.custom import TruCustomApp
tru_app = TruCustomApp(
llm_app, app_name="LLM App", app_version="v1", feedbacks=[f_groundtruth]
)
# Wrap llm_app in a TruCustomApp recorder, registered under app name
# "LLM App" / version "v1", with f_groundtruth attached as its feedback.
from trulens.apps.custom import TruCustomApp
tru_app = TruCustomApp(
llm_app, app_name="LLM App", app_version="v1", feedbacks=[f_groundtruth]
)
In [ ]:
Copied!
# The instrumented app can operate as a context manager: the calls to
# llm_app made inside the `with` block are the ones TruLens records.
with tru_app as recording:
    llm_app.completion("¿quien invento la bombilla?")
    llm_app.completion("who invented the lightbulb?")
# The instrumented app can operate as a context manager: the calls to
# llm_app made inside the `with` block are the ones TruLens records.
with tru_app as recording:
    llm_app.completion("¿quien invento la bombilla?")
    llm_app.completion("who invented the lightbulb?")
See results¶
In [ ]:
Copied!
session.get_leaderboard(app_ids=[tru_app.app_id])
session.get_leaderboard(app_ids=[tru_app.app_id])