import logging
import sys
import os
import pandas as pd
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Response
from llama_index.core.evaluation import (
    DatasetGenerator,
    RelevancyEvaluator,
    FaithfulnessEvaluator,
    EvaluationResult,
)
from llama_index.llms.openai import OpenAI
from tabulate import tabulate
import textwrap
import argparse
import traceback
from httpx import ReadTimeout

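# Log to stdout at INFO level so document loading and evaluation progress is visible in the terminal.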
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

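# CLI flags limit the run to a subset of documents and questions, which is handy for quick smoke tests.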
parser = argparse.ArgumentParser(
    description="Process documents and questions for evaluation."
)
parser.add_argument(
    "--num_documents",
    type=int,
    default=None,
    help="Number of documents to process (default: all)",
)
parser.add_argument(
    "--skip_documents",
    type=int,
    default=0,
    help="Number of documents to skip at the beginning (default: 0)",
)
parser.add_argument(
    "--num_questions",
    type=int,
    default=None,
    help="Number of questions to process (default: all)",
)
parser.add_argument(
    "--skip_questions",
    type=int,
    default=0,
    help="Number of questions to skip at the beginning (default: 0)",
)
parser.add_argument(
    "--process_last_questions",
    action="store_true",
    help="Process last N questions instead of first N",
)
args = parser.parse_args()

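# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.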
load_dotenv(".env")

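# Read the source documents that questions will be generated from and evaluated against.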
reader = SimpleDirectoryReader("/tmp/elastic/production-readiness-review")
documents = reader.load_data()
for i, doc in enumerate(documents[:3], start=1):
    print(f"Document {i}: {doc.text}")


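# Optionally skip and/or truncate the document list according to the CLI flags.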
if args.skip_documents > 0:
    documents = documents[args.skip_documents :]

if args.num_documents is not None:
    documents = documents[: args.num_documents]

print(f"Number of documents loaded: {len(documents)}")

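# gpt-4o serves as both the question generator and the evaluation judge; the timeout is in seconds.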
llm = OpenAI(model="gpt-4o", timeout=120)

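# DatasetGenerator drafts candidate evaluation questions from the document nodes.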
data_generator = DatasetGenerator.from_documents(documents, llm=llm)

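# Question generation calls the LLM, so guard against timeouts and other failures.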
try:
    eval_questions = data_generator.generate_questions_from_nodes()
    if isinstance(eval_questions, str):
        eval_questions_list = eval_questions.strip().split("\n")
    else:
        eval_questions_list = eval_questions
    eval_questions_list = [q for q in eval_questions_list if q.strip()]
    all_questions_list = list(eval_questions_list)

    if args.skip_questions > 0:
        eval_questions_list = eval_questions_list[args.skip_questions :]

    if args.num_questions is not None:
        if args.process_last_questions:
            eval_questions_list = eval_questions_list[-args.num_questions :]
        else:
            eval_questions_list = eval_questions_list[: args.num_questions]

    print("\nAll available questions generated:")
    for idx, q in enumerate(all_questions_list, start=1):
        print(f"{idx}. {q}")

    print("\nQuestions selected for processing:")
    for idx, q in enumerate(eval_questions_list, start=1):
        print(f"{idx}. {q}")
except ReadTimeout:
    print(
        "Request to OpenAI timed out during question generation. Please check the API status or increase the timeout duration."
    )
    traceback.print_exc()
    sys.exit(1)
except Exception as e:
    print(f"An error occurred while generating questions: {e}")
    traceback.print_exc()
    sys.exit(1)

| 113 | +print(f"\nTotal number of questions generated: {len(eval_questions_list)}") |
| 114 | + |
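# Relevancy judges whether the response and retrieved context address the query;
# faithfulness judges whether the response is grounded in the retrieved context.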
evaluator_relevancy = RelevancyEvaluator(llm=llm)
evaluator_faith = FaithfulnessEvaluator(llm=llm)

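# Build an in-memory vector index over the same documents to answer the generated questions.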
vector_index = VectorStoreIndex.from_documents(documents)


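# Pretty-print a single evaluation result (query, response, source, and judge verdicts) as a grid table.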
def display_eval_df(
    query: str,
    response: Response,
    eval_result_relevancy: EvaluationResult,
    eval_result_faith: EvaluationResult,
) -> None:
    relevancy_feedback = getattr(eval_result_relevancy, "feedback", "")
    relevancy_passing = getattr(eval_result_relevancy, "passing", False)
    relevancy_passing_str = "Pass" if relevancy_passing else "Fail"

    relevancy_score = 1.0 if relevancy_passing else 0.0

    faithfulness_feedback = getattr(eval_result_faith, "feedback", "")
    faithfulness_passing_bool = getattr(eval_result_faith, "passing", False)
    faithfulness_passing = "Pass" if faithfulness_passing_bool else "Fail"

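    # Wrap long strings so the tabulate columns stay readable in the terminal.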
    def wrap_text(text, width=50):
        if text is None:
            return ""
        text = str(text)
        text = text.replace("\r", "")
        lines = text.split("\n")
        wrapped_lines = []
        for line in lines:
            wrapped_lines.extend(textwrap.wrap(line, width=width))
            wrapped_lines.append("")
        return "\n".join(wrapped_lines)

    if response.source_nodes:
        source_content = wrap_text(response.source_nodes[0].node.get_content())
    else:
        source_content = ""

    eval_data = {
        "Query": wrap_text(query),
        "Response": wrap_text(str(response)),
        "Source": source_content,
        "Relevancy Response": relevancy_passing_str,
        "Relevancy Feedback": wrap_text(relevancy_feedback),
        "Relevancy Score": wrap_text(str(relevancy_score)),
        "Faith Response": faithfulness_passing,
        "Faith Feedback": wrap_text(faithfulness_feedback),
    }

    eval_df = pd.DataFrame([eval_data])

    print("\nEvaluation Result:")
    print(
        tabulate(
            eval_df, headers="keys", tablefmt="grid", showindex=False, stralign="left"
        )
    )


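# Query each selected question against the index and evaluate the response for relevancy and faithfulness.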
query_engine = vector_index.as_query_engine(llm=llm)

total_questions = len(eval_questions_list)
for idx, question in enumerate(eval_questions_list, start=1):
    try:
        response_vector = query_engine.query(question)
        eval_result_relevancy = evaluator_relevancy.evaluate_response(
            query=question, response=response_vector
        )
        eval_result_faith = evaluator_faith.evaluate_response(response=response_vector)

        print(f"\nProcessing Question {idx} of {total_questions}:")
        display_eval_df(
            question, response_vector, eval_result_relevancy, eval_result_faith
        )
    except ReadTimeout:
        print(f"Request to OpenAI timed out while processing question {idx}.")
        traceback.print_exc()
        continue
    except Exception as e:
        print(f"An error occurred while processing question {idx}: {e}")
        traceback.print_exc()
        continue