Evaluating Sentence Window RAG¶

In this notebook, we use the SentenceWindowNodeParser to parse documents into single sentences per node. Each node also contains a "window" with the sentences on either side of the node sentence.

Then, after retrieval and before the retrieved sentences are passed to the LLM, each single sentence is replaced with a window containing its surrounding sentences using the MetadataReplacementPostProcessor.

Finally, we show how to evaluate retrieval in this setting and compare it against a baseline VectorStoreIndex.

In [ ]:

Copied!

# %pip install trulens trulens-apps-llamaindex trulens-providers-openai llama_index==0.10.11 sentence-transformers transformers pypdf gdown

In [ ]:

Copied!

import os

# Set the TruLens tracing flag to "0" in the process environment.
# NOTE(review): presumably this switches off TruLens' OpenTelemetry tracing and
# has to be set before any trulens module is imported — confirm against the
# TruLens documentation.
os.environ["TRULENS_OTEL_TRACING"] = "0"

Query Engine Construction¶

In [ ]:

Copied!





import os
from getpass import getpass

import openai
from trulens.apps.llamaindex import TruLlama
from trulens.core import Feedback
from trulens.core import FeedbackMode
from trulens.core import Select
from trulens.core import TruSession
from trulens.providers.openai import OpenAI as fOpenAI

# Resolve the OpenAI key without embedding it in the notebook: keep a key that
# is already exported in the environment, otherwise prompt for one. An
# unconditional assignment of a literal here would clobber a valid exported key
# and invites committing a real secret.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

# Create the TruLens session used by the recorders and the dashboard below,
# then reset its database.
# NOTE(review): reset_database() presumably discards records logged by earlier
# runs — confirm before pointing this at a shared database.
session = TruSession()
session.reset_database()

In [ ]:

Copied!

!curl https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_Chapter03.pdf --output IPCC_AR6_WGII_Chapter03.pdf
!curl https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_Chapter03.pdf --output IPCC_AR6_WGII_Chapter03.pdf

In [ ]:

Copied!

from llama_index.core import SimpleDirectoryReader

# Read the downloaded chapter from the working directory. The result is a
# collection of document objects, each exposing a `.text` attribute (they are
# merged into a single document in a later cell).
reader = SimpleDirectoryReader(input_files=["./IPCC_AR6_WGII_Chapter03.pdf"])
documents = reader.load_data()

In [ ]:

Copied!

# sentence-window index
!gdown "https://drive.google.com/uc?id=16pH4NETEs43dwJUvYnJ9Z-bsR9_krkrP"
!tar -xzf sentence_index.tar.gz
# sentence-window index
!gdown "https://drive.google.com/uc?id=16pH4NETEs43dwJUvYnJ9Z-bsR9_krkrP"
!tar -xzf sentence_index.tar.gz

In [ ]:

Copied!

from llama_index.core import Document

# Collapse the per-page documents into one large document: the page texts are
# concatenated in order, separated by a blank line.
page_texts = [page.text for page in documents]
document = Document(text="\n\n".join(page_texts))

In [ ]:

Copied!





from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.settings import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Parser for the sentence-window pipeline: the window text is stored under the
# "window" metadata key and the original sentence under "original_text".
# window_size=3 controls how many neighbouring sentences go into the window.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Splitter used to build the baseline (non-windowed) nodes.
text_splitter = SentenceSplitter()

# LLM shared by the query engines.
llm = OpenAI(model="gpt-4o-mini", temperature=0.1)

# Register the global llama_index defaults.
Settings.llm = llm
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# NOTE(review): the relative order of the next two assignments is kept as-is on
# purpose; `Settings.text_splitter` may be an alias of `Settings.node_parser`,
# in which case the second assignment overrides the first — confirm against the
# llama_index Settings documentation.
Settings.node_parser = node_parser
Settings.text_splitter = text_splitter

In [ ]:

Copied!

from llama_index.core import VectorStoreIndex

# Parse the merged `document` built in the previous cell instead of the
# per-page `documents` list. The merge exists so that text is not cut at page
# boundaries; feeding the per-page list here left the merged document unused.
# Both pipelines parse the same input so that the comparison stays fair.
nodes = node_parser.get_nodes_from_documents([document])
base_nodes = text_splitter.get_nodes_from_documents([document])

# Sentence-window index (single-sentence nodes carrying window metadata).
sentence_index = VectorStoreIndex(nodes)

# Baseline index built from the plain sentence-splitter chunks.
base_index = VectorStoreIndex(base_nodes)

Define Evals¶

In [ ]:

Copied!





import numpy as np

# Feedback provider: every feedback function below is a method of this object.
provider = fOpenAI()

# --- Answer-level feedbacks -------------------------------------------------

# Helpfulness, evaluated on the app output only.
f_helpfulness = Feedback(provider.helpfulness).on_output()

# Relevance of the final answer to the overall question.
f_qa_relevance = Feedback(provider.relevance_with_cot_reasons).on_input_output()

# --- Context relevance --------------------------------------------------------
# Scores the question against each retrieved context chunk and averages the
# per-chunk scores. Two variants are defined because the sub-question engine
# exposes its context at a different location in the record.

f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons)
    .on_input()
    .on(TruLlama.select_context())
    .aggregate(np.mean)
)

f_context_relevance_subquestions = (
    Feedback(provider.context_relevance_with_cot_reasons)
    .on_input()
    .on(Select.Record.calls[0].rets.source_nodes[:].node.text)
    .aggregate(np.mean)
)

# --- Groundedness -------------------------------------------------------------
# Groundedness with chain-of-thought reasons, comparing the selected context
# with the output. As with context relevance, there is one variant for the
# overall question and one for the sub-question engine; the latter collects the
# source-node texts into a single argument.

f_groundedness = (
    Feedback(provider.groundedness_measure_with_cot_reasons)
    .on(TruLlama.select_context())
    .on_output()
)

f_groundedness_subquestions = (
    Feedback(provider.groundedness_measure_with_cot_reasons)
    .on(Select.Record.calls[0].rets.source_nodes[:].node.text.collect())
    .on_output()
)

Querying¶

In [ ]:

Copied!





from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from trulens.apps.llamaindex import TruLlama

# Query engine over the sentence-window index: retrieve the two most similar
# single-sentence nodes, then replace each node's text with the content stored
# under its "window" metadata key before it is handed to the LLM.
sentence_query_engine = sentence_index.as_query_engine(
    similarity_top_k=2,
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)

# Wrap the engine in a TruLens recorder. All versions in this notebook share
# the same app_name and differ only in app_version.
tru_sentence_query_engine_recorder = TruLlama(
    sentence_query_engine,
    app_name="climate query engine",
    app_version="sentence_window_index",
    feedbacks=[
        f_qa_relevance,
        f_context_relevance,
        f_groundedness,
        f_helpfulness,
    ],
)

# Run the query exactly once inside the recorder context; running this block
# twice would register the recorder again and pay for the LLM calls twice.
with tru_sentence_query_engine_recorder:
    sentence_query_engine.query("What are the concerns surrounding the AMOC?")

Contrast with normal VectorStoreIndex¶

In [ ]:

Copied!





# Baseline for comparison: a query engine over the plain VectorStoreIndex,
# using the same top-k as the sentence-window engine and no node postprocessor.
query_engine = base_index.as_query_engine(similarity_top_k=2)

# Recorder for the baseline, logged under the same app_name with its own
# app_version and the same feedback functions as the sentence-window engine.
tru_query_engine_recorder = TruLlama(
    query_engine,
    app_name="climate query engine",
    app_version="vector_store_index",
    feedbacks=[
        f_qa_relevance,
        f_context_relevance,
        f_groundedness,
        f_helpfulness,
    ],
)

# Run the same question exactly once; a second copy of this block would
# re-create the recorder and record (and pay for) the query twice.
with tru_query_engine_recorder:
    query_engine.query("What are the concerns surrounding the AMOC?")

Also Compare with Sub-Question Query Engine + Sentence Window Engine¶

In [ ]:

Copied!





from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.tools import QueryEngineTool
from llama_index.core.tools import ToolMetadata

# Sub-question engine whose only tool is the sentence-window query engine.
subquestion_query_engine = SubQuestionQueryEngine.from_defaults(
    [
        QueryEngineTool(
            query_engine=sentence_query_engine,
            metadata=ToolMetadata(
                name="climate_report", description="Climate Report on Oceans."
            ),
        )
    ],
    verbose=False,
)

# Recorder for the sub-question engine. In addition to the feedbacks used for
# the other engines it attaches the *_subquestions variants, which select the
# context from a different location in the record.
tru_subquestion_query_engine_recorder = TruLlama(
    subquestion_query_engine,
    app_name="climate query engine",
    app_version="sub_question_query_engine",
    feedbacks=[
        f_qa_relevance,
        f_context_relevance,
        f_context_relevance_subquestions,
        f_groundedness,
        f_groundedness_subquestions,
        f_helpfulness,
    ],
)

# Run the same question exactly once; a second copy of this block would
# rebuild the engine and recorder and record (and pay for) the query twice.
with tru_subquestion_query_engine_recorder:
    subquestion_query_engine.query("What are the concerns surrounding the AMOC?")

In [ ]:

Copied!

from trulens.dashboard import run_dashboard

# Launch the TruLens dashboard for the session created at the top of the
# notebook, to inspect the records and feedback results of the three app versions.
run_dashboard(session)

In [ ]:

Copied!





# Evaluation prompts, in the order they will be asked.
questions = [
    # Open-ended questions about the chapter's content.
    "Based on the provided text, discuss the impact of human activities on the natural carbon dynamics of estuaries, shelf seas, and other intertidal and shallow-water habitats. Provide examples from the text to support your answer.",
    "Analyze the combined effects of exploitation and multi-decadal climate fluctuations on global fisheries yields. How do these factors make it difficult to assess the impacts of global climate change on fisheries yields? Use specific examples from the text to support your analysis.",
    # Questions that name a specific cited study.
    "Based on the study by Gutiérrez-Rodríguez, A.G., et al., 2018, what potential benefits do seaweeds have in the field of medicine, specifically in relation to cancer treatment?",
    "According to the research conducted by Haasnoot, M., et al., 2020, how does the uncertainty in Antarctic mass-loss impact the coastal adaptation strategy of the Netherlands?",
    "Based on the context, explain how the decline in warm water coral reefs is projected to impact the services they provide to society, particularly in terms of coastal protection.",
    # NOTE(review): presumably an intentionally off-topic control question — confirm.
    "Tell me something about the intricacies of tying a tie.",
]