Skip to content

Commit 72359fb

Browse files
authored
Merge pull request #13 from e06084/main
feat: add evaluate_batched
2 parents 6ace3ce + 426c218 commit 72359fb

10 files changed

Lines changed: 708 additions & 457 deletions

data/sample_dataset.jsonl

Lines changed: 4 additions & 4 deletions
Large diffs are not rendered by default.

examples/basic_usage.py

Lines changed: 41 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -706,29 +706,54 @@ def demo_dataset_with_extraction():
706706
from webmainbench import DataLoader, DataSaver, Evaluator, ExtractorFactory
707707
from pathlib import Path
708708

709-
# 从文件加载数据集
709+
# 配置文件路径
710710
data_dir = Path("data")
711711
dataset_path = data_dir / "sample_dataset.jsonl"
712712
# dataset_path = "/Users/chupei/Downloads/WebMainBench_dataset_merge_2549.jsonl"
713713

714-
print(f"📂 从文件加载数据集: {dataset_path}")
715-
dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
716-
dataset.name = "WebMainBench_with_extraction"
717-
dataset.description = "演示抽取内容保存的测试数据集"
714+
print(f"📂 数据集文件: {dataset_path}")
718715

719-
print(f"📊 加载数据集完成,包含 {len(dataset.samples)} 个样本")
716+
# 🔧 创建llm-webkit抽取器(统一使用)
717+
extractor_config = {"model_path": "/Users/chupei/model/checkpoint-3296"}
718+
extractor = ExtractorFactory.create("llm-webkit", config=extractor_config)
719+
print(f"🤖 使用抽取器: {extractor.name}")
720720

721-
# 创建抽取器并运行评测
722-
try:
723-
extractor = ExtractorFactory.create("llm-webkit", config={"model_path": "/Users/chupei/model/checkpoint-3296"})
724-
print(f"🤖 使用抽取器: {extractor.name}")
725-
except Exception as e:
726-
print(f"⚠️ LLM-WebKit抽取器创建失败,使用mock抽取器: {e}")
727-
extractor = ExtractorFactory.create("mock")
728-
729-
# 运行评测
721+
# 创建评测器
730722
evaluator = Evaluator()
731-
result = evaluator.evaluate(dataset, extractor)
723+
724+
# 🔧 选择评测模式:内存模式 vs 批处理模式
725+
USE_BATCHED_MODE = True # 设置为True使用批处理模式(适用于大数据集)
726+
727+
if USE_BATCHED_MODE:
728+
print("🔄 使用批处理模式(内存优化)")
729+
730+
# 🚀 批处理评测(适用于大数据集)
731+
result = evaluator.evaluate_batched(
732+
jsonl_file_path=dataset_path,
733+
extractor=extractor, # 直接传递extractor对象
734+
batch_size=10, # 小批次
735+
max_samples=20 # 演示用
736+
)
737+
print(f"✅ 批处理评测完成,总体得分: {result.overall_metrics.get('overall', 0):.4f}")
738+
739+
# 为了保存带有抽取内容的数据集,需要重新加载原始数据集
740+
# 注:这里只是短暂加载用于保存,不影响前面的内存优化评测
741+
dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
742+
dataset.name = result.dataset_name
743+
744+
else:
745+
print("🔄 使用传统内存模式")
746+
747+
# 从文件加载数据集
748+
print(f"📂 从文件加载数据集: {dataset_path}")
749+
dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
750+
dataset.name = "WebMainBench_with_extraction"
751+
dataset.description = "演示抽取内容保存的测试数据集"
752+
753+
print(f"📊 加载数据集完成,包含 {len(dataset.samples)} 个样本")
754+
755+
# 运行评测
756+
result = evaluator.evaluate(dataset, extractor)
732757

733758
print(f"✅ 评测完成,总体得分: {result.overall_metrics.get('overall', 0):.4f}")
734759

0 commit comments

Comments (0)