Azure OpenAI Llama Index Quickstart¶
In this quickstart you will create a simple LlamaIndex app and learn how to log it and get feedback on an LLM response, using both an embedding model and a chat completion model from Azure OpenAI.
In [ ]:
Copied!
# !pip install trulens trulens-apps-llamaindex trulens-providers-openai llama_index==0.9.13 llama-index-llms-azure-openai llama-index-embeddings-azure-openai langchain==0.0.346 html2text==2020.1.16
# !pip install trulens trulens-apps-llamaindex trulens-providers-openai llama_index==0.9.13 llama-index-llms-azure-openai llama-index-embeddings-azure-openai langchain==0.0.346 html2text==2020.1.16
Add API keys¶
For this quickstart, you will need more information from Azure OpenAI than for typical OpenAI usage. These values can be retrieved from https://oai.azure.com/ . The deployment name used below can also be found on that Azure OpenAI page.
In [ ]:
Copied!
# Azure OpenAI connection parameters; look the values up on your
# https://oai.azure.com dashboard and substitute them below.
import os

os.environ.update(
    {
        "AZURE_OPENAI_API_KEY": "...",  # azure
        "AZURE_OPENAI_ENDPOINT": "https://<your endpoint here>.openai.azure.com/",  # azure
        "OPENAI_API_VERSION": "2023-07-01-preview",  # may need updating
        "OPENAI_API_TYPE": "azure",
    }
)
# Check your https://oai.azure.com dashboard to retrieve params:
import os

os.environ["AZURE_OPENAI_API_KEY"] = "..."  # azure
# The "<your endpoint here>" placeholder marks the part of the URL to replace
# with your own Azure OpenAI resource name.
os.environ["AZURE_OPENAI_ENDPOINT"] = (
    "https://<your endpoint here>.openai.azure.com/"  # azure
)
os.environ["OPENAI_API_VERSION"] = "2023-07-01-preview"  # may need updating
os.environ["OPENAI_API_TYPE"] = "azure"
Import from TruLens¶
In [ ]:
Copied!
# TruLens entry points used throughout this notebook.
from trulens.apps.llamaindex import TruLlama
from trulens.core import Feedback, TruSession

# Create the session object and reset its database before logging anything
# (presumably this discards records left over from earlier runs — confirm
# before pointing it at a database you want to keep).
session = TruSession()
session.reset_database()
# TruLens entry points used throughout this notebook.
from trulens.apps.llamaindex import TruLlama
from trulens.core import Feedback, TruSession

# Create the session object and reset its database before logging anything
# (presumably this discards records left over from earlier runs — confirm
# before pointing it at a database you want to keep).
session = TruSession()
session.reset_database()
Create Simple LLM Application¶
This example uses LlamaIndex, which internally uses an Azure OpenAI LLM and an Azure OpenAI embedding model.
In [ ]:
Copied!
import os

from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.legacy.readers import SimpleWebPageReader
from llama_index.llms.azure_openai import AzureOpenAI

# Chat-completion model, served by your own Azure OpenAI deployment.
llm = AzureOpenAI(
    model="gpt-35-turbo",
    deployment_name="<your deployment>",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version=os.environ["OPENAI_API_VERSION"],
)

# You need to deploy your own embedding model as well as your own chat
# completion model.
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="<your deployment>",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version=os.environ["OPENAI_API_VERSION"],
)

# Download the essay that the index is built from.
# NOTE(review): the reader still comes from llama_index.legacy; confirm that
# the documents it returns are accepted by llama_index.core.VectorStoreIndex.
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["http://paulgraham.com/worked.html"]
)

# Register both Azure models as the global defaults on llama_index.core's
# Settings object. The index below is a llama_index.core class, so the models
# must be configured in llama_index.core; a global service context set through
# llama_index.legacy lives in a different package and is not the configuration
# llama_index.core reads.
Settings.llm = llm
Settings.embed_model = embed_model

index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
import os

from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.legacy.readers import SimpleWebPageReader
from llama_index.llms.azure_openai import AzureOpenAI

# Chat-completion model, served by your own Azure OpenAI deployment.
# "<your deployment>" is a placeholder for that deployment's name.
llm = AzureOpenAI(
    model="gpt-35-turbo",
    deployment_name="<your deployment>",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version=os.environ["OPENAI_API_VERSION"],
)

# You need to deploy your own embedding model as well as your own chat
# completion model.
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="<your deployment>",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version=os.environ["OPENAI_API_VERSION"],
)

# Download the essay that the index is built from.
# NOTE(review): the reader still comes from llama_index.legacy; confirm that
# the documents it returns are accepted by llama_index.core.VectorStoreIndex.
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["http://paulgraham.com/worked.html"]
)

# Register both Azure models as the global defaults on llama_index.core's
# Settings object. The index below is a llama_index.core class, so the models
# must be configured in llama_index.core; a global service context set through
# llama_index.legacy lives in a different package and is not the configuration
# llama_index.core reads.
Settings.llm = llm
Settings.embed_model = embed_model

index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
Send your first request¶
In [ ]:
Copied!
# Send one question straight to the query engine (no TruLens recording yet)
# and show the retrieved sources followed by the question and the answer.
query = "What is most interesting about this essay?"
answer = query_engine.query(query)

print(answer.get_formatted_sources())
print(f"query was: {query}")
print(f"answer was: {answer!s}")
# Send one question straight to the query engine (no TruLens recording yet)
# and show the retrieved sources followed by the question and the answer.
query = "What is most interesting about this essay?"
answer = query_engine.query(query)

print(answer.get_formatted_sources())
print(f"query was: {query}")
print(f"answer was: {answer!s}")
Initialize Feedback Function(s)¶
In [ ]:
Copied!
import numpy as np
from trulens.feedback.v2.feedback import Groundedness  # not used in this cell; import kept in case other cells rely on it
from trulens.providers.openai import AzureOpenAI

# NOTE: from here on the name `AzureOpenAI` refers to the TruLens feedback
# provider, not the LlamaIndex LLM class imported under the same name when the
# application was created.

# Initialize AzureOpenAI-based feedback function collection class.
# The deployment name must match a chat-completion deployment in your own
# Azure OpenAI resource.
azopenai = AzureOpenAI(deployment_name="truera-gpt-35-turbo")

# Question/answer relevance between overall question and answer.
f_qa_relevance = Feedback(
    azopenai.relevance, name="Answer Relevance"
).on_input_output()

# Question/statement relevance between question and each context chunk;
# the per-chunk scores are averaged into a single value.
f_context_relevance = (
    Feedback(
        azopenai.context_relevance_with_cot_reasons, name="Context Relevance"
    )
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

# Groundedness of the output on the retrieved context. The measure is called
# on the provider itself: the `Groundedness` class importable from
# trulens.feedback.v2.feedback is not a provider wrapper and does not offer
# `groundedness_measure_with_cot_reasons` or an aggregator, so the source
# chunks are collected into one argument and no `.aggregate(...)` is applied.
f_groundedness = (
    Feedback(
        azopenai.groundedness_measure_with_cot_reasons, name="Groundedness"
    )
    .on(TruLlama.select_source_nodes().node.text.collect())
    .on_output()
)
import numpy as np
from trulens.feedback.v2.feedback import Groundedness  # not used in this cell; import kept in case other cells rely on it
from trulens.providers.openai import AzureOpenAI

# NOTE: from here on the name `AzureOpenAI` refers to the TruLens feedback
# provider, not the LlamaIndex LLM class imported under the same name when the
# application was created.

# Initialize AzureOpenAI-based feedback function collection class.
# The deployment name must match a chat-completion deployment in your own
# Azure OpenAI resource.
azopenai = AzureOpenAI(deployment_name="truera-gpt-35-turbo")

# Question/answer relevance between overall question and answer.
f_qa_relevance = Feedback(
    azopenai.relevance, name="Answer Relevance"
).on_input_output()

# Question/statement relevance between question and each context chunk;
# the per-chunk scores are averaged into a single value.
f_context_relevance = (
    Feedback(
        azopenai.context_relevance_with_cot_reasons, name="Context Relevance"
    )
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

# Groundedness of the output on the retrieved context. The measure is called
# on the provider itself: the `Groundedness` class importable from
# trulens.feedback.v2.feedback is not a provider wrapper and does not offer
# `groundedness_measure_with_cot_reasons` or an aggregator, so the source
# chunks are collected into one argument and no `.aggregate(...)` is applied.
f_groundedness = (
    Feedback(
        azopenai.groundedness_measure_with_cot_reasons, name="Groundedness"
    )
    .on(TruLlama.select_source_nodes().node.text.collect())
    .on_output()
)
Custom functions can also use the Azure provider¶
In [ ]:
Copied!
from typing import Dict, Tuple
from trulens.feedback import prompts
class Custom_AzureOpenAI(AzureOpenAI):
    """AzureOpenAI feedback provider extended with two custom feedback functions."""

    def style_check_professional(self, response: str) -> float:
        """
        Grade how professional the style of a response is.

        Builds a rating prompt around the response and hands it to the
        provider's `generate_score` as the system prompt.

        Args:
            response (str): text to be graded for professional style.

        Returns:
            float: A value between 0 and 1. 0 being "not professional" and 1 being "professional".
        """
        rating_template = "Please rate the professionalism of the following text on a scale from 0 to 10, where 0 is not at all professional and 10 is extremely professional: \n\n{}"
        professional_prompt = rating_template.format(response)
        return self.generate_score(system_prompt=professional_prompt)

    def context_relevance_with_cot_reasons_extreme(
        self, question: str, statement: str
    ) -> Tuple[float, Dict]:
        """
        Tweaked question/statement relevance check that favours extreme scores.

        Fills in the `prompts.context_relevance` template with the question
        and statement, removes the scoring guideline for the middle scores
        (5-8) so the LLM is pushed towards more extreme scores, and swaps the
        "RELEVANCE:" marker for the chain-of-thought reasons template so the
        reasons are emitted as well.

        Args:
            question (str): A question being asked.
            statement (str): A statement to the question.

        Returns:
            Tuple[float, Dict]: the result of `generate_score_and_reasons`;
            the score is a value between 0 and 1, 0 being "not relevant" and
            1 being "relevant" (the second element presumably carries the
            reasons — confirm against the provider).
        """
        # Guideline sentence describing the middle scores; deleting it is what
        # makes this variant "extreme".
        middle_score_guideline = "- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\n\n"

        system_prompt = (
            prompts.context_relevance.format(
                question=question, statement=statement
            )
            .replace(middle_score_guideline, "")
            .replace("RELEVANCE:", prompts.COT_REASONS_TEMPLATE)
        )
        return self.generate_score_and_reasons(system_prompt)
# Provider instance exposing the two custom feedback functions.
custom_azopenai = Custom_AzureOpenAI(deployment_name="truera-gpt-35-turbo")

# Question/statement relevance between question and each context chunk,
# averaged over the chunks.
_extreme_relevance = Feedback(
    custom_azopenai.context_relevance_with_cot_reasons_extreme,
    name="Context Relevance - Extreme",
)
f_context_relevance_extreme = (
    _extreme_relevance.on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

# Professional-style score, computed on the application's output only.
_style_check = Feedback(
    custom_azopenai.style_check_professional, name="Professional Style"
)
f_style_check = _style_check.on_output()
from typing import Dict, Tuple
from trulens.feedback import prompts
class Custom_AzureOpenAI(AzureOpenAI):
    """AzureOpenAI feedback provider extended with two custom feedback functions."""

    def style_check_professional(self, response: str) -> float:
        """
        Grade how professional the style of a response is.

        Builds a rating prompt around the response and hands it to the
        provider's `generate_score` as the system prompt.

        Args:
            response (str): text to be graded for professional style.

        Returns:
            float: A value between 0 and 1. 0 being "not professional" and 1 being "professional".
        """
        rating_template = "Please rate the professionalism of the following text on a scale from 0 to 10, where 0 is not at all professional and 10 is extremely professional: \n\n{}"
        professional_prompt = rating_template.format(response)
        return self.generate_score(system_prompt=professional_prompt)

    def context_relevance_with_cot_reasons_extreme(
        self, question: str, statement: str
    ) -> Tuple[float, Dict]:
        """
        Tweaked question/statement relevance check that favours extreme scores.

        Fills in the `prompts.context_relevance` template with the question
        and statement, removes the scoring guideline for the middle scores
        (5-8) so the LLM is pushed towards more extreme scores, and swaps the
        "RELEVANCE:" marker for the chain-of-thought reasons template so the
        reasons are emitted as well.

        Args:
            question (str): A question being asked.
            statement (str): A statement to the question.

        Returns:
            Tuple[float, Dict]: the result of `generate_score_and_reasons`;
            the score is a value between 0 and 1, 0 being "not relevant" and
            1 being "relevant" (the second element presumably carries the
            reasons — confirm against the provider).
        """
        # Guideline sentence describing the middle scores; deleting it is what
        # makes this variant "extreme".
        middle_score_guideline = "- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.\n\n"

        system_prompt = (
            prompts.context_relevance.format(
                question=question, statement=statement
            )
            .replace(middle_score_guideline, "")
            .replace("RELEVANCE:", prompts.COT_REASONS_TEMPLATE)
        )
        return self.generate_score_and_reasons(system_prompt)
# Provider instance exposing the two custom feedback functions.
custom_azopenai = Custom_AzureOpenAI(deployment_name="truera-gpt-35-turbo")

# Question/statement relevance between question and each context chunk,
# averaged over the chunks.
_extreme_relevance = Feedback(
    custom_azopenai.context_relevance_with_cot_reasons_extreme,
    name="Context Relevance - Extreme",
)
f_context_relevance_extreme = (
    _extreme_relevance.on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

# Professional-style score, computed on the application's output only.
_style_check = Feedback(
    custom_azopenai.style_check_professional, name="Professional Style"
)
f_style_check = _style_check.on_output()
Instrument chain for logging with TruLens¶
In [ ]:
Copied!
# Every feedback function defined above, built-in and custom.
feedback_functions = [
    f_groundedness,
    f_qa_relevance,
    f_context_relevance,
    f_context_relevance_extreme,
    f_style_check,
]

# Wrap the query engine in a TruLlama recorder that carries those feedbacks.
tru_query_engine_recorder = TruLlama(
    query_engine,
    app_name="LlamaIndex_App1_AzureOpenAI",
    feedbacks=feedback_functions,
)
# Every feedback function defined above, built-in and custom.
feedback_functions = [
    f_groundedness,
    f_qa_relevance,
    f_context_relevance,
    f_context_relevance_extreme,
    f_style_check,
]

# Wrap the query engine in a TruLlama recorder that carries those feedbacks.
tru_query_engine_recorder = TruLlama(
    query_engine,
    app_name="LlamaIndex_App1_AzureOpenAI",
    feedbacks=feedback_functions,
)
In [ ]:
Copied!
# Ask the same question again, this time inside the recorder's context so the
# call is made while `tru_query_engine_recorder` is active.
query = "What is most interesting about this essay?"

with tru_query_engine_recorder as recording:
    answer = query_engine.query(query)

print(answer.get_formatted_sources())
print(f"query was: {query}")
print(f"answer was: {answer!s}")
# Ask the same question again, this time inside the recorder's context so the
# call is made while `tru_query_engine_recorder` is active.
query = "What is most interesting about this essay?"

with tru_query_engine_recorder as recording:
    answer = query_engine.query(query)

print(answer.get_formatted_sources())
print(f"query was: {query}")
print(f"answer was: {answer!s}")
Explore in a Dashboard¶
In [ ]:
Copied!
# Launch the TruLens dashboard for the session created earlier in the notebook.
from trulens.dashboard import run_dashboard

run_dashboard(session)  # open a local streamlit app to explore
# stop_dashboard(session)  # uncomment to stop the dashboard if needed
# Launch the TruLens dashboard for the session created earlier in the notebook.
from trulens.dashboard import run_dashboard

run_dashboard(session)  # open a local streamlit app to explore
# stop_dashboard(session)  # uncomment to stop the dashboard if needed
Or view results directly in your notebook¶
In [ ]:
Copied!
# Restrict the lookup to the app that was just recorded.
app_ids = [tru_query_engine_recorder.app_id]

# Fetch what the session logged for that app; `feedback` is presumably the
# collection of feedback names/columns — confirm against the TruLens docs.
records, feedback = session.get_records_and_feedback(app_ids=app_ids)

# Last expression of the cell, so the notebook displays it.
records
# Restrict the lookup to the app that was just recorded.
app_ids = [tru_query_engine_recorder.app_id]

# Fetch what the session logged for that app; `feedback` is presumably the
# collection of feedback names/columns — confirm against the TruLens docs.
records, feedback = session.get_records_and_feedback(app_ids=app_ids)

# Last expression of the cell, so the notebook displays it.
records
In [ ]:
Copied!
# Show the session's leaderboard, limited to the recorded app.
session.get_leaderboard(app_ids=[tru_query_engine_recorder.app_id])
# Show the session's leaderboard, limited to the recorded app.
session.get_leaderboard(app_ids=[tru_query_engine_recorder.app_id])