Skip to content

Commit ddae69a

Browse files
committed
add extractor version in results
1 parent c791758 commit ddae69a

2 files changed

Lines changed: 23 additions & 15 deletions

File tree

webmainbench/data/saver.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -126,32 +126,45 @@ def save_summary_report(results: Union["EvaluationResult", List["EvaluationResul
126126
file_path: Output CSV file path
127127
"""
128128
import csv
129-
129+
from importlib import metadata as importlib_metadata
130+
130131
file_path = Path(file_path)
131132
file_path.parent.mkdir(parents=True, exist_ok=True)
132-
133-
# Convert EvaluationResult objects to dicts and ensure we have a list
133+
134+
# 转换结果为字典列表
134135
def to_dict_if_needed(item):
135136
return item.to_dict() if hasattr(item, 'to_dict') else item
136-
137+
137138
if isinstance(results, list):
138139
results_list = [to_dict_if_needed(item) for item in results]
139140
else:
140141
results_list = [to_dict_if_needed(results)]
141-
142-
# Prepare CSV data
142+
143143
csv_data = []
144-
145144
for result in results_list:
146-
# Extract basic info
147145
metadata = result.get('metadata', {})
148146
error_analysis = result.get('error_analysis', {})
147+
148+
# 获取抽取器版本
149+
extractor_name = metadata.get('extractor_name', 'unknown')
150+
try:
151+
# 映射抽取器名称到包名
152+
package_mapping = {
153+
'llm-webkit': 'llm_web_kit',
154+
'magic-html': 'magic_html',
155+
'trafilatura': 'trafilatura',
156+
'resiliparse': 'resiliparse'
157+
}
158+
package_name = package_mapping.get(extractor_name, extractor_name)
159+
extractor_version = importlib_metadata.version(package_name)
160+
except importlib_metadata.PackageNotFoundError:
161+
extractor_version = 'unknown'
149162
row = {
150163
'extractor': metadata.get('extractor_name', 'unknown'),
151164
'dataset': metadata.get('dataset_name', 'unknown'),
152165
'total_samples': metadata.get('total_samples', 0),
153166
'success_rate': error_analysis.get('success_rate', 0.0),
154-
'extractor_version': metadata.get('version', 'unknown')
167+
'extractor_version': extractor_version,
155168
}
156169

157170
# Add all available metrics from overall_metrics
@@ -170,7 +183,7 @@ def get_sort_key(row):
170183
# Write CSV file
171184
if csv_data:
172185
# Define field order: basic info first, then overall, then other metrics alphabetically
173-
basic_fields = ['extractor', 'dataset', 'total_samples', 'success_rate','extractor_version']
186+
basic_fields = ['extractor','extractor_version', 'dataset', 'total_samples', 'success_rate']
174187

175188
# Get all metric fields from the data
176189
all_fields = set()

webmainbench/evaluator/evaluator.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ class EvaluationResult:
2323
extractor_name: str
2424
timestamp: str
2525
total_samples: int
26-
version: str # 抽取器版本
2726

2827
# Overall metrics
2928
overall_metrics: Dict[str, float]
@@ -49,7 +48,6 @@ def to_dict(self) -> Dict[str, Any]:
4948
"extractor_name": self.extractor_name,
5049
"timestamp": self.timestamp,
5150
"total_samples": self.total_samples,
52-
"version": self.version
5351
},
5452
"overall_metrics": self.overall_metrics,
5553
"sample_results": self.sample_results,
@@ -74,7 +72,6 @@ def from_dict(cls, data: Dict[str, Any]) -> "EvaluationResult":
7472
error_analysis=data.get("error_analysis"),
7573
extractor_config=data.get("extractor_config"),
7674
metric_config=data.get("metric_config"),
77-
version=metadata.get("version", "unknown"),
7875
)
7976

8077

@@ -185,8 +182,6 @@ def evaluate(self,
185182
error_analysis=error_analysis,
186183
extractor_config=extractor.get_config(),
187184
metric_config=self.metric_config,
188-
# 新增:传入抽取器版本(从extractor对象获取)
189-
version=getattr(extractor, 'version', 'unknown'),
190185
)
191186

192187
return evaluation_result

0 commit comments

Comments
 (0)