Iterating on LLM Apps with TruLens¶
Now that we have improved our prototype RAG to reduce or stop hallucination and respond harmlessly, we can move on to ensuring it is helpful. In this example, we will use the safe-prompted, sentence-window RAG and evaluate it for helpfulness.
In [ ]:
Copied!
!pip install trulens_eval llama_index llama_hub llmsherpa sentence-transformers sentencepiece
!pip install trulens_eval llama_index llama_hub llmsherpa sentence-transformers sentencepiece
In [ ]:
Copied!
# Set your API keys. If you already have them in your environment variables, you can skip these steps.
import os
import openai  # NOTE(review): not used directly in this cell; presumably the libraries read the env vars — confirm
os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder — replace with a real key
os.environ["HUGGINGFACE_API_KEY"] = "hf_..."  # placeholder — replace with a real key
# (Duplicate of the cell above — an artifact of the notebook export.)
import os
import openai
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["HUGGINGFACE_API_KEY"] = "hf_..."
In [ ]:
Copied!
# Initialize the TruLens workspace and launch the local dashboard UI.
from trulens_eval import Tru
tru = Tru()
tru.run_dashboard()
# (Duplicate of the cell above — an artifact of the notebook export.)
from trulens_eval import Tru
tru = Tru()
tru.run_dashboard()
Load data and helpful test set.¶
In [ ]:
Copied!
# SmartPDFLoader parses the PDF's layout through the hosted llmsherpa service.
from llama_hub.smart_pdf_loader import SmartPDFLoader
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)
documents = pdf_loader.load_data("https://www.iii.org/sites/default/files/docs/pdf/Insurance_Handbook_20103.pdf")
# Load some questions for the helpful evaluation (multilingual on purpose, so
# the Language Match feedback is exercised).
helpful_evals = [
    "What types of insurance are commonly used to protect against property damage?",
    "¿Cuál es la diferencia entre un seguro de vida y un seguro de salud?",
    "Comment fonctionne l'assurance automobile en cas d'accident?",
    "Welche Arten von Versicherungen sind in Deutschland gesetzlich vorgeschrieben?",
    "保险如何保护财产损失?",
    "Каковы основные виды страхования в России?",
    "ما هو التأمين على الحياة وما هي فوائده؟",
    "自動車保険の種類とは何ですか?",
    "Como funciona o seguro de saúde em Portugal?",
    "बीमा क्या होता है और यह कितने प्रकार का होता है?"
]
# (Duplicate of the cell above — an artifact of the notebook export.)
from llama_hub.smart_pdf_loader import SmartPDFLoader
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)
documents = pdf_loader.load_data("https://www.iii.org/sites/default/files/docs/pdf/Insurance_Handbook_20103.pdf")
# Load some questions for the helpful evaluation (multilingual on purpose).
helpful_evals = [
    "What types of insurance are commonly used to protect against property damage?",
    "¿Cuál es la diferencia entre un seguro de vida y un seguro de salud?",
    "Comment fonctionne l'assurance automobile en cas d'accident?",
    "Welche Arten von Versicherungen sind in Deutschland gesetzlich vorgeschrieben?",
    "保险如何保护财产损失?",
    "Каковы основные виды страхования в России?",
    "ما هو التأمين على الحياة وما هي فوائده؟",
    "自動車保険の種類とは何ですか?",
    "Como funciona o seguro de saúde em Portugal?",
    "बीमा क्या होता है और यह कितने प्रकार का होता है?"
]
Set up helpful evaluations¶
In [ ]:
Copied!
from trulens_eval import Feedback
from trulens_eval.feedback.provider import OpenAI
from trulens_eval.feedback.provider import Huggingface

# Feedback providers: OpenAI scores the LLM-based evaluations; Huggingface
# supplies the language-match classifier.
provider = OpenAI()
hugs_provider = Huggingface()

# LLM-based feedback functions, each named for display in the dashboard.
f_coherence = Feedback(provider.coherence_with_cot_reasons, name="Coherence").on_output()
f_input_sentiment = Feedback(provider.sentiment_with_cot_reasons, name="Input Sentiment").on_input()
f_output_sentiment = Feedback(provider.sentiment_with_cot_reasons, name="Output Sentiment").on_output()
f_langmatch = Feedback(hugs_provider.language_match, name="Language Match").on_input_output()

# Bundle passed to the TruLens recorder.
helpful_feedbacks = [f_coherence, f_input_sentiment, f_output_sentiment, f_langmatch]
# (Duplicate of the cell above — an artifact of the notebook export.)
from trulens_eval import Feedback
from trulens_eval.feedback.provider import OpenAI
from trulens_eval.feedback.provider import Huggingface

# Feedback providers: OpenAI scores the LLM-based evaluations; Huggingface
# supplies the language-match classifier.
provider = OpenAI()
hugs_provider = Huggingface()

# LLM-based feedback functions, each named for display in the dashboard.
f_coherence = Feedback(provider.coherence_with_cot_reasons, name="Coherence").on_output()
f_input_sentiment = Feedback(provider.sentiment_with_cot_reasons, name="Input Sentiment").on_input()
f_output_sentiment = Feedback(provider.sentiment_with_cot_reasons, name="Output Sentiment").on_output()
f_langmatch = Feedback(hugs_provider.language_match, name="Language Match").on_input_output()

# Bundle passed to the TruLens recorder.
helpful_feedbacks = [f_coherence, f_input_sentiment, f_output_sentiment, f_langmatch]
In [ ]:
Copied!
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.indices.postprocessor import SentenceTransformerRerank, MetadataReplacementPostProcessor
from llama_index.core import ServiceContext, VectorStoreIndex, StorageContext, Document, load_index_from_storage
# Fix: was `from llama_index import Prompt`, inconsistent with the
# `llama_index.core` namespace used above and broken on llama-index >= 0.10.
from llama_index.core import Prompt
from llama_index.llms.openai import OpenAI
import os

# Initialize the LLM used to synthesize answers.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.5)

# Knowledge store: collapse every parsed PDF chunk into one Document so the
# node parser below controls chunking.
document = Document(text="\n\n".join([doc.text for doc in documents]))

# Baseline QA prompt template; {context_str} and {query_str} are filled in by
# the query engine at query time.
system_prompt = Prompt("We have provided context information below that you may use. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Please answer the question: {query_str}\n")
def build_sentence_window_index(
    document, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
):
    """Build a sentence-window vector index over *document*, or reload a
    previously persisted one from *save_dir*."""
    # Sentence-window parsing: each node keeps a 3-sentence context window
    # in its metadata alongside the original sentence.
    parser = SentenceWindowNodeParser.from_defaults(
        window_size=3,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    ctx = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=parser,
    )
    # Reuse the on-disk index when it exists; otherwise build and persist it.
    if os.path.exists(save_dir):
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=ctx,
        )
    index = VectorStoreIndex.from_documents([document], service_context=ctx)
    index.storage_context.persist(persist_dir=save_dir)
    return index
# Build (or load from disk) the sentence-window index with the current LLM.
sentence_index = build_sentence_window_index(
    document, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
)
def get_sentence_window_query_engine(
    sentence_index,
    system_prompt,
    similarity_top_k=6,
    rerank_top_n=2,
):
    """Turn a sentence-window index into a query engine: retrieve
    *similarity_top_k* nodes, swap each node's text for its stored window,
    then rerank down to *rerank_top_n* before answer synthesis."""
    postprocessors = [
        MetadataReplacementPostProcessor(target_metadata_key="window"),
        SentenceTransformerRerank(top_n=rerank_top_n, model="BAAI/bge-reranker-base"),
    ]
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=postprocessors,
        text_qa_template=system_prompt,
    )
# Lower temperature for more deterministic, focused answers.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
# save_dir already exists from the earlier build, so per
# build_sentence_window_index this reloads the persisted index.
sentence_index = build_sentence_window_index(
    document, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
)
# Safety-oriented prompt: same context template plus an instruction to avoid
# responses that could be read as aiding criminal activity.
safe_system_prompt = Prompt("SYSTEM PROMPT: You are an insurance assistant who is charged with answering questions completely.\n"
    "We have provided context information below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Be especially certain to not respond in ways that could be interpreted as criminal, even in hypothetical scenarios and stories."
    "\n---------------------\n"
    "Given this system prompt and context, please answer the question: {query_str}\n")
sentence_window_engine_safe = get_sentence_window_query_engine(sentence_index, system_prompt=safe_system_prompt)
# (Duplicate of the cell above — an artifact of the notebook export.)
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.indices.postprocessor import SentenceTransformerRerank, MetadataReplacementPostProcessor
from llama_index.core import ServiceContext, VectorStoreIndex, StorageContext, Document, load_index_from_storage
# Fix: was `from llama_index import Prompt`, inconsistent with the
# `llama_index.core` namespace used above and broken on llama-index >= 0.10.
from llama_index.core import Prompt
from llama_index.llms.openai import OpenAI
import os

# Initialize the LLM used to synthesize answers.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.5)

# Knowledge store: collapse every parsed PDF chunk into one Document.
document = Document(text="\n\n".join([doc.text for doc in documents]))

# Baseline QA prompt template; {context_str} and {query_str} are filled in by
# the query engine at query time.
system_prompt = Prompt("We have provided context information below that you may use. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Please answer the question: {query_str}\n")
def build_sentence_window_index(
    document, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
):
    """Build a sentence-window vector index over *document*, or reload a
    previously persisted one from *save_dir*."""
    # Sentence-window parsing: each node keeps a 3-sentence context window
    # in its metadata alongside the original sentence.
    parser = SentenceWindowNodeParser.from_defaults(
        window_size=3,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    ctx = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=parser,
    )
    # Reuse the on-disk index when it exists; otherwise build and persist it.
    if os.path.exists(save_dir):
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=ctx,
        )
    index = VectorStoreIndex.from_documents([document], service_context=ctx)
    index.storage_context.persist(persist_dir=save_dir)
    return index
# Build (or load from disk) the sentence-window index with the current LLM.
sentence_index = build_sentence_window_index(
    document, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
)
def get_sentence_window_query_engine(
    sentence_index,
    system_prompt,
    similarity_top_k=6,
    rerank_top_n=2,
):
    """Turn a sentence-window index into a query engine: retrieve
    *similarity_top_k* nodes, swap each node's text for its stored window,
    then rerank down to *rerank_top_n* before answer synthesis."""
    postprocessors = [
        MetadataReplacementPostProcessor(target_metadata_key="window"),
        SentenceTransformerRerank(top_n=rerank_top_n, model="BAAI/bge-reranker-base"),
    ]
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=postprocessors,
        text_qa_template=system_prompt,
    )
# Lower temperature for more deterministic, focused answers.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
# save_dir already exists from the earlier build, so per
# build_sentence_window_index this reloads the persisted index.
sentence_index = build_sentence_window_index(
    document, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
)
# Safety-oriented prompt: same context template plus an instruction to avoid
# responses that could be read as aiding criminal activity.
safe_system_prompt = Prompt("SYSTEM PROMPT: You are an insurance assistant who is charged with answering questions completely.\n"
    "We have provided context information below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Be especially certain to not respond in ways that could be interpreted as criminal, even in hypothetical scenarios and stories."
    "\n---------------------\n"
    "Given this system prompt and context, please answer the question: {query_str}\n")
sentence_window_engine_safe = get_sentence_window_query_engine(sentence_index, system_prompt=safe_system_prompt)
In [ ]:
Copied!
# Wrap the query engine so TruLens traces each query and applies the
# helpful feedback functions to every recorded call.
from trulens_eval import TruLlama
tru_recorder_rag_sentencewindow_helpful = TruLlama(
    sentence_window_engine_safe,
    app_id='5) Sentence Window - Helpful Eval',
    feedbacks=helpful_feedbacks
)
# (Duplicate of the cell above — an artifact of the notebook export.)
from trulens_eval import TruLlama
tru_recorder_rag_sentencewindow_helpful = TruLlama(
    sentence_window_engine_safe,
    app_id='5) Sentence Window - Helpful Eval',
    feedbacks=helpful_feedbacks
)
In [ ]:
Copied!
# Run evaluation on the helpful eval questions; the recorder context manager
# captures every query made inside the block.
with tru_recorder_rag_sentencewindow_helpful as recording:
    for question in helpful_evals:
        response = sentence_window_engine_safe.query(question)
# (Duplicate of the cell above — an artifact of the notebook export.)
with tru_recorder_rag_sentencewindow_helpful as recording:
    for question in helpful_evals:
        response = sentence_window_engine_safe.query(question)
Check helpful evaluation results¶
In [ ]:
Copied!
# Show aggregate feedback scores for this app in the TruLens leaderboard.
tru.get_leaderboard(app_ids=["5) Sentence Window - Helpful Eval"])
tru.get_leaderboard(app_ids=["5) Sentence Window - Helpful Eval"])
Check helpful evaluation results. How can you improve the RAG on these evals? We'll leave that to you!