Commit 66a443b

Evaluation

1 parent 065250c commit 66a443b
12 files changed: +417 -29 lines changed

docs/evaluation.md

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
# Evaluating the RAG answer quality

Install all the dependencies for the evaluation script by running the following command:

```bash
pip install -r requirements-dev.txt
```

## Generate ground truth data

Generate ground truth data by running the following command:

```bash
python evals/generate.py
```

Review the generated data after running that script, removing any question/answer pairs that don't seem like realistic user input.
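
A quick way to skim the generated pairs is a short helper like the sketch below. It is not part of this commit; it only assumes that `evals/ground_truth.jsonl` holds one JSON object per line, and it prints whatever fields the generator produced.

```python
# Review helper sketch, not included in this commit.
# Assumes evals/ground_truth.jsonl holds one JSON object per line; the field names
# inside each object depend on the ground truth generator's output.
import json
from pathlib import Path

lines = Path("evals/ground_truth.jsonl").read_text().splitlines()
for i, line in enumerate(lines, start=1):
    record = json.loads(line)
    print(f"--- pair {i} ---")
    for key, value in record.items():
        print(f"{key}: {value}")
```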
## Evaluate the RAG answer quality

Review the configuration in `evals/eval_config.json` to ensure that everything is correctly set up. You may want to adjust the metrics used. [TODO: link to evaluator docs]

By default, the evaluation script evaluates every question in the ground truth data. Run it with the following command:

```bash
python evals/evaluate.py
```

## Review the evaluation results

The evaluation script will output a summary of the evaluation results inside the `evals/results` directory.

You can see a summary of results across all evaluation runs by running the following command:

```bash
python -m evaltools summary evals/results
```

Compare answers across runs by running the following command:

```bash
python -m evaltools diff evals/results/baseline/
```

## Run the evaluation in GitHub Actions

# TODO: Add GPT-4 deployment with high capacity for evaluation
# TODO: Add CI workflow that can be triggered to run the evaluation on the local app

evals/README.md

Lines changed: 8 additions & 2 deletions
@@ -1,2 +1,8 @@
-pip install git+https://github.com/Azure-Samples/ai-rag-chat-evaluator/@installable
-pip install psycopg2
+pip install -r requirements-dev.txt
+
+python evals/generate.py
+
+python evals/evaluate.py
+
+# TODO: Add GPT-4 deployment with high capacity for evaluation
+# TODO: Add CI workflow that can be triggered to run the evaluation on the local app

evals/eval_config.json

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
{
    "testdata_path": "ground_truth.jsonl",
    "results_dir": "results/experiment<TIMESTAMP>",
    "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citation_match"],
    "target_url": "http://host.docker.internal:8000/chat",
    "target_parameters": {
        "overrides": {
            "use_advanced_flow": true,
            "top": 3,
            "retrieval_mode": "hybrid",
            "temperature": 0.3
        }
    },
    "target_response_answer_jmespath": "message.content",
    "target_response_context_jmespath": "context.data_points"
}
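
The two `*_jmespath` settings above tell the evaluator where to find the answer text and the retrieval context in the `/chat` response. A minimal sketch of that lookup using the `jmespath` package, with a response shape assumed from the configured paths rather than taken from the app:

```python
# Illustration of the jmespath lookups configured above. The sample response is an
# assumption shaped to match the configured paths, not the app's exact schema.
import jmespath

sample_response = {
    "message": {"content": "The Trailblazer hiking boots are waterproof. [42]"},
    "context": {"data_points": ["[42] - Trailblazer hiking boots"]},
}

answer = jmespath.search("message.content", sample_response)  # evaluated answer text
context = jmespath.search("context.data_points", sample_response)  # retrieved sources
print(answer)
print(context)
```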

evals/evaluate.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
import logging
import os
from pathlib import Path

from dotenv import load_dotenv
from evaltools.eval.evaluate import run_evaluate_from_config
from promptflow.core import AzureOpenAIModelConfiguration, ModelConfiguration, OpenAIModelConfiguration

logger = logging.getLogger("ragapp")


def get_openai_config() -> ModelConfiguration:
    if os.environ.get("OPENAI_CHAT_HOST") == "azure":
        azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
        azure_deployment = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT")
        api_version = "2023-07-01-preview"
        if os.environ.get("AZURE_OPENAI_KEY"):
            logger.info("Using Azure OpenAI Service with API Key from AZURE_OPENAI_KEY")
            openai_config = AzureOpenAIModelConfiguration(
                azure_endpoint=azure_endpoint,
                azure_deployment=azure_deployment,
                api_version=api_version,
                api_key=os.environ["AZURE_OPENAI_KEY"],
            )
        else:
            logger.info("Using Azure OpenAI Service with Azure Developer CLI Credential")
            openai_config = AzureOpenAIModelConfiguration(
                azure_endpoint=azure_endpoint, azure_deployment=azure_deployment, api_version=api_version
            )
            # PromptFlow will call DefaultAzureCredential behind the scenes
        openai_config.model = os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"]
    else:
        logger.info("Using OpenAI Service with API Key from OPENAICOM_KEY")
        openai_config = OpenAIModelConfiguration(
            model=os.environ["OPENAICOM_CHAT_MODEL"], api_key=os.environ.get("OPENAICOM_KEY")
        )
    return openai_config


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    logger.setLevel(logging.INFO)
    load_dotenv(".env", override=True)

    openai_config = get_openai_config()
    run_evaluate_from_config(
        working_dir=Path(__file__).parent, config_path="eval_config.json", openai_config=openai_config, num_questions=20
    )
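
Everything in `get_openai_config()` comes from environment variables (`OPENAI_CHAT_HOST`, `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_CHAT_DEPLOYMENT`, `AZURE_OPENAI_KEY`, or `OPENAICOM_CHAT_MODEL`/`OPENAICOM_KEY`). A hypothetical smoke test, not part of this commit, that checks the `.env` resolves to a model configuration before starting a full evaluation run (it assumes it is run from the `evals/` directory):

```python
# Hypothetical smoke test (not in this commit): confirm that get_openai_config() can
# build a model configuration from the current .env before a long evaluation run.
from dotenv import load_dotenv
from evaluate import get_openai_config  # assumes the working directory is evals/

load_dotenv(".env", override=True)
config = get_openai_config()
print(type(config).__name__, getattr(config, "model", None))
```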

evals/generate_ground_truth.py

Lines changed: 17 additions & 17 deletions
@@ -27,24 +27,24 @@ def source_retriever() -> Generator[dict, None, None]:
         item_types = session.scalars(select(Item.type).distinct())
         for item_type in item_types:
             records = list(session.scalars(select(Item).filter(Item.type == item_type).order_by(Item.id)))
-            logger.info(f"Processing database records for type: {item_type}")
-            yield {
-                "citations": " ".join([f"[{record.id}] - {record.name}" for record in records]),
-                "content": "\n\n".join([record.to_str_for_rag() for record in records]),
-            }
+            # logger.info(f"Processing database records for type: {item_type}")
+            # yield {
+            #     "citations": " ".join([f"[{record.id}] - {record.name}" for record in records]),
+            #     "content": "\n\n".join([record.to_str_for_rag() for record in records]),
+            # }
         # Fetch each item individually
-        # records = session.scalars(select(Item).order_by(Item.id))
-        # for record in records:
-        #     logger.info(f"Processing database record: {record.name}")
-        #     yield {"id": [record.id], "content": record.to_str_for_rag()}
+        records = session.scalars(select(Item).order_by(Item.id))
+        for record in records:
+            logger.info(f"Processing database record: {record.name}")
+            yield {"id": record.id, "content": record.to_str_for_rag()}
 
 
 def source_to_text(source) -> str:
     return source["content"]
 
 
 def answer_formatter(answer, source) -> str:
-    return f"{answer} {source['citations']}"
+    return f"{answer} [{source['id']}]"
 
 
 def get_openai_config_dict() -> dict:
@@ -85,11 +85,11 @@ def get_openai_config_dict() -> dict:
     load_dotenv(".env", override=True)
 
     generate_test_qa_data(
-        get_openai_config_dict(),
-        10,
-        5,
-        Path(__file__).parent / "ground_truth.json",
-        source_retriever,
-        source_to_text,
-        answer_formatter,
+        openai_config=get_openai_config_dict(),
+        num_questions_total=202,
+        num_questions_per_source=2,
+        output_file=Path(__file__).parent / "ground_truth.jsonl",
+        source_retriever=source_retriever,
+        source_to_text=source_to_text,
+        answer_formatter=answer_formatter,
     )
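
With the retriever now yielding one record at a time, each source carries a single `id`, so `answer_formatter` appends exactly one bracketed citation per generated answer. An illustration with a made-up record (values are hypothetical):

```python
# Hypothetical record and answer, only to show the citation format produced by
# answer_formatter after this change.
def answer_formatter(answer, source) -> str:
    return f"{answer} [{source['id']}]"

source = {"id": 42, "content": "Name: Trailblazer Boots\nDescription: Waterproof hiking boots"}
answer = "The Trailblazer Boots are waterproof hiking boots."
print(answer_formatter(answer, source))  # -> The Trailblazer Boots are waterproof hiking boots. [42]
```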

evals/ground_truth.json

Lines changed: 0 additions & 10 deletions
This file was deleted.
