Ground Truth Evaluations
In this quickstart you will create and evaluate an app using ground truth. Ground truth evaluation can be especially useful during early LLM experiments when you have a small set of example queries that are critical to get right.
Ground truth evaluation works by comparing an LLM response to its matching verified response.
Add API keys
For this quickstart, you will need an OpenAI API key.
In [ ]:
Copied!
# Install TruLens core, the OpenAI feedback provider, and the OpenAI SDK.
# !pip install trulens trulens-providers-openai openai
In [ ]:
Copied!
import os

# Supply the OpenAI API key only if it is not already set, so an existing
# key in the environment is never overwritten.
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = "sk-proj-..."

# Enable OpenTelemetry tracing so TruLens instrumentation emits spans.
os.environ["TRULENS_OTEL_TRACING"] = "1"
In [ ]:
Copied!
# Create a TruSession, the connection to the TruLens logging database.
from trulens.core import TruSession

session = TruSession()
Create a Simple LLM Application
In [ ]:
Copied!
from openai import OpenAI
from trulens.core.otel.instrument import instrument

oai_client = OpenAI()


class APP:
    """Minimal LLM app that forwards a user prompt to OpenAI chat completions."""

    @instrument()  # record this call as a span in TruLens traces
    def completion(self, prompt):
        """Answer `prompt` with gpt-3.5-turbo and return the response text.

        Args:
            prompt: The user question to answer.

        Returns:
            The assistant message content of the first completion choice.
        """
        completion = (
            oai_client.chat.completions.create(
                model="gpt-3.5-turbo",
                temperature=0,  # deterministic output for reproducible evals
                messages=[
                    {
                        "role": "user",
                        "content": f"Please answer the question: {prompt}",
                    }
                ],
            )
            .choices[0]
            .message.content
        )
        return completion


llm_app = APP()
Initialize Feedback Function(s)
In [ ]:
Copied!
from trulens.core import Feedback
from trulens.feedback import GroundTruthAgreement
from trulens.providers.openai import OpenAI as fOpenAI

# Golden set: queries paired with their verified (ground-truth) answers.
golden_set = [
    {
        "query": "who invented the lightbulb?",
        "expected_response": "Thomas Edison",
    },
    {
        # The leading inverted question mark was mojibake in the original
        # export; restored to proper UTF-8 here.
        "query": "¿quien invento la bombilla?",
        "expected_response": "Thomas Edison",
    },
]

# Feedback function that scores semantic agreement between the app's
# response and the expected response for the matching golden-set query.
f_groundtruth = Feedback(
    GroundTruthAgreement(golden_set, provider=fOpenAI()).agreement_measure,
    name="Ground Truth Semantic Agreement",
).on_input_output()
Instrument the app for logging with TruLens
In [ ]:
Copied!
# Register the app with TruLens so calls are recorded and the
# ground-truth feedback function is evaluated on each input/output pair.
from trulens.apps.app import TruApp

tru_app = TruApp(
    llm_app, app_name="LLM App", app_version="v1", feedbacks=[f_groundtruth]
)
In [ ]:
Copied!
# The instrumented app operates as a context manager: calls made inside
# the `with` block are recorded and scored against the golden set.
# (The "¿" below was mojibake in the original export; restored here.)
with tru_app as recording:
    llm_app.completion("¿quien invento la bombilla?")
    llm_app.completion("who invented the lightbulb?")
See results
In [ ]:
Copied!
# Show aggregate feedback results for this app version.
session.get_leaderboard(app_ids=[tru_app.app_id])
In [ ]:
Copied!
# Launch the TruLens dashboard to explore records and feedback scores.
from trulens.dashboard import run_dashboard

run_dashboard(session=session)