📓 Evaluate Streaming Apps¶
This notebook shows how to trace and evaluate an app with streaming generation.
It also shows the use of the dummy feedback function provider, which behaves like the HuggingFace provider except that it does not perform any network calls and simply produces constant results. You can use it to prototype the feedback function wiring for your apps before invoking feedback functions that may be slow to run or to load.
Import libraries¶
In [ ]:
Copied!
# !pip install trulens trulens-providers-huggingface
# !pip install trulens trulens-providers-huggingface
Set keys¶
In [ ]:
Copied!
import os

# Keep any key already configured in the environment; otherwise install a
# placeholder (swap in a real key before making actual OpenAI calls).
os.environ.setdefault("OPENAI_API_KEY", "sk-proj-...")

# Enable TruLens OpenTelemetry-based tracing.
os.environ["TRULENS_OTEL_TRACING"] = "1"

# The cell text is duplicated by the docs export; re-running it is a no-op.
os.environ.setdefault("OPENAI_API_KEY", "sk-proj-...")
os.environ["TRULENS_OTEL_TRACING"] = "1"
In [ ]:
Copied!
# Create a TruLens session and start from an empty database so the records
# produced by this notebook are the only ones shown.
from trulens.core import Feedback
from trulens.core import TruSession
session = TruSession()
session.reset_database()
# NOTE(review): the repeated lines below are a notebook-to-docs export
# artifact (the "Copied!" rendering duplicates each cell), not extra logic.
from trulens.core import Feedback
from trulens.core import TruSession
session = TruSession()
session.reset_database()
In [ ]:
Copied!
# Launch the TruLens dashboard web UI for this session.
from trulens.dashboard import run_dashboard
run_dashboard(session)
# NOTE(review): duplicate of the cell above — a docs-export artifact.
from trulens.dashboard import run_dashboard
run_dashboard(session)
Build the app¶
In [ ]:
Copied!
from openai import OpenAI
from trulens.apps.app import instrument

oai_client = OpenAI()


class APP:
    """Minimal app whose completion method streams tokens from OpenAI."""

    @instrument  # let TruLens trace this generator method
    def stream_completion(self, prompt):
        """Yield the model's answer to *prompt* as text fragments arrive."""
        stream = oai_client.chat.completions.create(
            model="gpt-4.1-mini",
            stream=True,
            # usage stats arrive on the final chunk; not yet tracked by trulens
            stream_options={"include_usage": True},
            temperature=0,
            messages=[
                {
                    "role": "user",
                    "content": f"Please answer the question: {prompt}",
                }
            ],
        )
        for chunk in stream:
            # Skip chunks with no choices (e.g. the usage-only final chunk)
            # and deltas that carry no text.
            choices = chunk.choices
            if choices and (text := choices[0].delta.content) is not None:
                yield text


llm_app = APP()
from openai import OpenAI
from trulens.apps.app import instrument

oai_client = OpenAI()


class APP:
    """Toy app exposing a streamed chat-completion generator."""

    @instrument  # TruLens instrumentation of the generator method
    def stream_completion(self, prompt):
        """Stream the response to *prompt*, yielding content deltas."""
        response_stream = oai_client.chat.completions.create(
            model="gpt-4.1-mini",
            stream=True,
            stream_options={
                "include_usage": True
            },  # not yet tracked by trulens
            temperature=0,
            messages=[
                {
                    "role": "user",
                    "content": f"Please answer the question: {prompt}",
                }
            ],
        )
        for event in response_stream:
            if not event.choices:
                continue  # e.g. the trailing usage-only chunk
            delta_text = event.choices[0].delta.content
            if delta_text is not None:
                yield delta_text


llm_app = APP()
Create dummy feedback¶
By setting the provider to `Dummy()`, you can build out your evaluation suite now and easily substitute a real model provider (e.g. OpenAI) later.
In [ ]:
Copied!
# The Dummy provider exposes the HuggingFace provider's interface but makes
# no network calls and produces constant results — cheap to wire up first,
# then swap for a real provider later.
from trulens.providers.huggingface.provider import Dummy
hugs = Dummy()
# Apply the (dummy) positive-sentiment feedback to the app's main output.
f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output()
# NOTE(review): the duplicated cell text below is a docs-export artifact.
from trulens.providers.huggingface.provider import Dummy
hugs = Dummy()
f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output()
Create the app¶
In [ ]:
Copied!
# add trulens as a context manager for llm_app with dummy feedback
# Wrap llm_app in a TruApp recorder so its calls are recorded under the
# given app name/version and the dummy feedback runs on each record.
from trulens.apps.app import TruApp
tru_app = TruApp(
llm_app,
app_name="LLM App",
app_version="v1",
feedbacks=[f_positive_sentiment],
)
# NOTE(review): the duplicated cell text below is a docs-export artifact.
# add trulens as a context manager for llm_app with dummy feedback
from trulens.apps.app import TruApp
tru_app = TruApp(
llm_app,
app_name="LLM App",
app_version="v1",
feedbacks=[f_positive_sentiment],
)
Run the app¶
In [ ]:
Copied!
# Invoke the streaming app inside the recording context so TruLens captures
# the call; tokens are printed to the notebook as they arrive.
with tru_app as recording:
    for piece in llm_app.stream_completion(
        "give me a good name for a colorful sock company and the store behind its founding"
    ):
        print(piece, end="")

record = recording.get()

# Duplicated cell text from the docs export — executing it again produces a
# second traced call and a second record, just like the original page.
with tru_app as recording:
    for piece in llm_app.stream_completion(
        "give me a good name for a colorful sock company and the store behind its founding"
    ):
        print(piece, end="")

record = recording.get()
In [ ]:
Copied!
# Check full output:
# The record's main_output is displayed to inspect the response captured
# from the streamed call above.
record.main_output
# NOTE(review): the duplicated cell text below is a docs-export artifact.
# Check full output:
record.main_output
In [ ]:
Copied!
# Show aggregate feedback results for this app on the session leaderboard.
session.get_leaderboard(app_ids=[tru_app.app_id])
# NOTE(review): the duplicated line below is a docs-export artifact.
session.get_leaderboard(app_ids=[tru_app.app_id])