feat: ragas improved testing (#2611)

# Description

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):
Commit b1fbbc8b02 (parent c287fceb5b)
Stan Girard, 2024-05-23 10:36:10 +02:00, committed by GitHub
8 changed files with 74 additions and 4 deletions


@@ -329,7 +329,7 @@ class QuivrRAG(BaseModel):
             model=self.model,
             temperature=self.temperature,
             api_base=api_base,
-        )
+        )  # pyright: ignore reportPrivateUsage=none
         if self.model_compatible_with_function_calling():
             # And finally, we do the part that returns the answers

@@ -1,4 +1,5 @@
 import argparse
+import json
 import os
 import sys
@@ -78,6 +79,7 @@ def main(
    score.to_json(output_folder + "/score.json", orient="records")
    for metric in metrics:
        print(f"{metric} scores: {score[metric]}")
        print(f"{metric} mean score: {score[metric].mean()}")
        print(f"{metric} median score: {score[metric].median()}")
    # Cleanup if a new brain was created
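For reference, the printing above assumes `score` behaves like a pandas DataFrame with one column per metric (ragas results can be converted with `.to_pandas()`). A self-contained sketch of the same summary step, with an illustrative helper name and sample data that are not part of this PR:

```python
# Hedged sketch: `score` is assumed to be a pandas DataFrame of per-question
# ragas scores; summarize_scores and the sample values are illustrative only.
import pandas as pd


def summarize_scores(score: pd.DataFrame, metrics: list[str], output_folder: str) -> None:
    # Persist raw per-question scores, then print simple aggregates per metric.
    score.to_json(output_folder + "/score.json", orient="records")
    for metric in metrics:
        print(f"{metric} mean score: {score[metric].mean()}")
        print(f"{metric} median score: {score[metric].median()}")


summarize_scores(
    pd.DataFrame({"answer_similarity": [0.91, 0.84, 0.77]}),
    metrics=["answer_similarity"],
    output_folder=".",
)
```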
@@ -143,7 +145,14 @@ def generate_replies(
     for question in test_questions:
         response = brain_chain.invoke({"question": question, "chat_history": []})
-        answers.append(response["answer"].content)
+        cited_answer_data = response["answer"].additional_kwargs["tool_calls"][0][
+            "function"
+        ]["arguments"]
+        cited_answer_obj = json.loads(cited_answer_data)
+        print(f"Answer: {cited_answer_obj['answer']}")
+        answers.append(cited_answer_obj["answer"])
+        print(f"Context: {cited_answer_obj}")
+        print(response)
         contexts.append([context.page_content for context in response["docs"]])
     return Dataset.from_dict(
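The substantive change here is that answers are no longer taken from `response["answer"].content`; instead the structured "cited answer" is pulled out of the model's tool call. A minimal, standalone sketch of that parsing step, using a mocked message in place of the brain chain's real response (the payload shape matches the diff; the helper name is illustrative):

```python
import json
from types import SimpleNamespace


def extract_cited_answer(answer_message) -> str:
    # Function-calling models return structured output as a tool call whose
    # JSON-encoded "arguments" carry the answer field used for evaluation.
    tool_call = answer_message.additional_kwargs["tool_calls"][0]
    cited_answer_obj = json.loads(tool_call["function"]["arguments"])
    return cited_answer_obj["answer"]


# Mocked stand-in for response["answer"] returned by brain_chain.invoke(...).
fake_message = SimpleNamespace(
    additional_kwargs={
        "tool_calls": [
            {"function": {"arguments": json.dumps({"answer": "42", "citations": []})}}
        ]
    }
)
print(extract_cited_answer(fake_message))  # -> 42
```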
@@ -189,7 +198,7 @@ if __name__ == "__main__":
             "faithfulness",
             "answer_similarity",
         ],
-        default=["answer_correctness"],
+        default=["answer_similarity"],
         help="Metrics to evaluate",
     )
     parser.add_argument(
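The default metric moves from `answer_correctness` to `answer_similarity`. For context, a hedged sketch of how such metric names could be mapped onto ragas metric objects and run through `ragas.evaluate`; the `NAME_TO_METRIC` mapping and the `run_eval` helper are illustrative, not part of this PR:

```python
# Hedged sketch, assuming a ragas version that exposes these metric objects.
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_correctness, answer_similarity, faithfulness

# Illustrative mapping from CLI metric names to ragas metric objects.
NAME_TO_METRIC = {
    "answer_correctness": answer_correctness,
    "answer_similarity": answer_similarity,
    "faithfulness": faithfulness,
}


def run_eval(dataset: Dataset, metric_names: list[str]):
    # evaluate() returns a Result; to_pandas() yields the per-question scores
    # that the script summarizes with mean/median above.
    result = evaluate(dataset, metrics=[NAME_TO_METRIC[name] for name in metric_names])
    return result.to_pandas()
```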

File diff suppressed because one or more lines are too long