📓 Evaluate Streaming Apps¶
This notebook shows how to trace and evaluate an app with streaming generation.
It also shows the use of the dummy feedback function provider, which behaves like the HuggingFace provider except that it does not perform any network calls and simply produces constant results. You can use it to prototype the feedback function wiring for your apps before invoking feedback functions that may be slow to run or to load.
Import libraries¶
In [ ]:
Copied!
# !pip install trulens trulens-providers-huggingface
# !pip install trulens trulens-providers-huggingface
Set keys¶
In [ ]:
Copied!
import os

# Keep any key already configured in the environment; otherwise install a
# placeholder (swap in a real key before making actual OpenAI calls).
os.environ.setdefault("OPENAI_API_KEY", "sk-proj-...")

# Enable TruLens OpenTelemetry-based tracing.
os.environ["TRULENS_OTEL_TRACING"] = "1"

# The cell text is duplicated by the docs export; re-running it is a no-op.
os.environ.setdefault("OPENAI_API_KEY", "sk-proj-...")
os.environ["TRULENS_OTEL_TRACING"] = "1"
In [ ]:
Copied!
# Create a TruLens session and start from an empty database so the records
# produced by this notebook are the only ones shown.
from trulens.core import Feedback
from trulens.core import TruSession
session = TruSession()
session.reset_database()
# NOTE(review): the repeated lines below are a notebook-to-docs export
# artifact (the "Copied!" rendering duplicates each cell), not extra logic.
from trulens.core import Feedback
from trulens.core import TruSession
session = TruSession()
session.reset_database()
In [ ]:
Copied!
# Launch the TruLens dashboard web UI for this session.
from trulens.dashboard import run_dashboard
run_dashboard(session)
# NOTE(review): duplicate of the cell above — a docs-export artifact.
from trulens.dashboard import run_dashboard
run_dashboard(session)
Build the app¶
In [ ]:
Copied!
from openai import OpenAI
from trulens.apps.app import instrument

oai_client = OpenAI()


class APP:
    """Minimal app whose completion method streams tokens from OpenAI."""

    @instrument  # let TruLens trace this generator method
    def stream_completion(self, prompt):
        """Yield the model's answer to *prompt* as text fragments arrive."""
        stream = oai_client.chat.completions.create(
            model="gpt-4.1-mini",
            stream=True,
            # usage stats arrive on the final chunk; not yet tracked by trulens
            stream_options={"include_usage": True},
            temperature=0,
            messages=[
                {
                    "role": "user",
                    "content": f"Please answer the question: {prompt}",
                }
            ],
        )
        for chunk in stream:
            # Skip chunks with no choices (e.g. the usage-only final chunk)
            # and deltas that carry no text.
            choices = chunk.choices
            if choices and (text := choices[0].delta.content) is not None:
                yield text


llm_app = APP()
from openai import OpenAI
from trulens.apps.app import instrument

oai_client = OpenAI()


class APP:
    """Toy app exposing a streamed chat-completion generator."""

    @instrument  # TruLens instrumentation of the generator method
    def stream_completion(self, prompt):
        """Stream the response to *prompt*, yielding content deltas."""
        response_stream = oai_client.chat.completions.create(
            model="gpt-4.1-mini",
            stream=True,
            stream_options={
                "include_usage": True
            },  # not yet tracked by trulens
            temperature=0,
            messages=[
                {
                    "role": "user",
                    "content": f"Please answer the question: {prompt}",
                }
            ],
        )
        for event in response_stream:
            if not event.choices:
                continue  # e.g. the trailing usage-only chunk
            delta_text = event.choices[0].delta.content
            if delta_text is not None:
                yield delta_text


llm_app = APP()
Create dummy feedback¶
By setting the provider to `Dummy()`, you can build out your evaluation suite now and easily substitute a real model provider (e.g. OpenAI) later.
In [ ]:
Copied!
# The Dummy provider exposes the HuggingFace provider's interface but makes
# no network calls and produces constant results — cheap to wire up first,
# then swap for a real provider later.
from trulens.providers.huggingface.provider import Dummy
hugs = Dummy()
# Apply the (dummy) positive-sentiment feedback to the app's main output.
f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output()
# NOTE(review): the duplicated cell text below is a docs-export artifact.
from trulens.providers.huggingface.provider import Dummy
hugs = Dummy()
f_positive_sentiment = Feedback(hugs.positive_sentiment).on_output()
Create the app¶
In [ ]:
Copied!
# add trulens as a context manager for llm_app with dummy feedback
# Wrap llm_app in a TruApp recorder so its calls are recorded under the
# given app name/version and the dummy feedback runs on each record.
from trulens.apps.app import TruApp
tru_app = TruApp(
llm_app,
app_name="LLM App",
app_version="v1",
feedbacks=[f_positive_sentiment],
)
# NOTE(review): the duplicated cell text below is a docs-export artifact.
# add trulens as a context manager for llm_app with dummy feedback
from trulens.apps.app import TruApp
tru_app = TruApp(
llm_app,
app_name="LLM App",
app_version="v1",
feedbacks=[f_positive_sentiment],
)
Run the app¶
In [ ]:
Copied!
# Invoke the streaming app inside the recording context so TruLens captures
# the call; tokens are printed to the notebook as they arrive.
with tru_app as recording:
    for piece in llm_app.stream_completion(
        "give me a good name for a colorful sock company and the store behind its founding"
    ):
        print(piece, end="")

record = recording.get()

# Duplicated cell text from the docs export — executing it again produces a
# second traced call and a second record, just like the original page.
with tru_app as recording:
    for piece in llm_app.stream_completion(
        "give me a good name for a colorful sock company and the store behind its founding"
    ):
        print(piece, end="")

record = recording.get()
In [ ]:
Copied!
# Check full output:
# The record's main_output is displayed to inspect the response captured
# from the streamed call above.
record.main_output
# NOTE(review): the duplicated cell text below is a docs-export artifact.
# Check full output:
record.main_output
In [ ]:
Copied!
# Show aggregate feedback results for this app on the session leaderboard.
session.get_leaderboard(app_ids=[tru_app.app_id])
# NOTE(review): the duplicated line below is a docs-export artifact.
session.get_leaderboard(app_ids=[tru_app.app_id])