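"""Tests for the AIR-Bench leaderboard data loading: parsing raw evaluation
JSON into FullEvalResult objects and building the leaderboard DataFrame."""
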
from pathlib import Path

from src.models import FullEvalResult
from src.read_evals import load_raw_eval_results
from src.utils import get_leaderboard_df
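
# Resolve fixture paths relative to this test file so the tests do not depend
# on the current working directory.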
cur_fp = Path(__file__)


def test_init_from_json_file():
    """A single result JSON expands into one entry per (task, domain, language, metric, dataset) combination."""
    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
    num_different_task_domain_lang_metric_dataset_combination = 6
    assert len(full_eval_result.results) == num_different_task_domain_lang_metric_dataset_combination
    assert full_eval_result.retrieval_model == "bge-m3"
    assert full_eval_result.reranking_model == "bge-reranker-v2-m3"


def test_to_dict():
    """to_dict() flattens the results for one task/metric into leaderboard rows with domain_language columns."""
    json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
    full_eval_result = FullEvalResult.init_from_json_file(json_fp)
    result_list = full_eval_result.to_dict(task="qa", metric="ndcg_at_1")
    assert len(result_list) == 1
    result_dict = result_list[0]
    assert result_dict["Retrieval Model"] == "bge-m3"
    assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
    assert result_dict["wiki_en"] is not None
    assert result_dict["wiki_zh"] is not None


def test_get_raw_eval_results():
    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
    results = load_raw_eval_results(results_path)
    # only the latest results are loaded
    assert len(results) == 4
    assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
    assert len(results[0].results) == 70
    assert results[1].eval_name == "bge-base-en-v1.5_bge-reranker-v2-m3"
    assert len(results[1].results) == 70


def test_get_leaderboard_df():
    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
    raw_data = load_raw_eval_results(results_path)
    df = get_leaderboard_df(raw_data, "qa", "ndcg_at_10")
    assert df.shape[0] == 4
    # the results contain only one embedding model
    # for i in range(4):
    #     assert df["Retrieval Model"][i] == "bge-m3"
    # # the results contain only two reranking models
    # assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
    # assert df["Reranking Model"][1] == "NoReranker"
    # assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
    # assert not df[["Average ⬆️", "wiki_en", "wiki_zh"]].isnull().values.any()


def test_get_leaderboard_df_long_doc():
    results_path = cur_fp.parents[2] / "toydata" / "test_results"
    raw_data = load_raw_eval_results(results_path)
    df = get_leaderboard_df(raw_data, "long-doc", "ndcg_at_1")
    assert df.shape[0] == 2
    # the results contain only one embedding model
    for i in range(2):
        assert df["Retrieval Model"][i] == "bge-m3"
    # the results contain only two reranking models
    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
    assert df["Reranking Model"][1] == "NoReranker"
    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
    assert not df[["Average ⬆️", "law_en_lex_files_500k_600k"]].isnull().values.any()