import json
import os
import resource
from pathlib import Path

import docker

from swebench.harness.docker_build import build_env_images
from swebench.harness.docker_utils import list_images, clean_images
from swebench.harness.run_evaluation import run_instances

# Directory where the SWE-bench harness writes per-instance evaluation logs
RUN_EVALUATION_LOG_DIR = Path("logs/run_evaluation")
# Path of the JSON file the LangSmith feedback is written to
LANGSMITH_EVALUATION_DIR = './langsmith_feedback/feedback.json'


def convert_runs_to_langsmith_feedback(
    predictions: dict,
    full_dataset: list,
    run_id: str,
) -> None:
    """
    Convert logs from the docker containers into LangSmith feedback and write
    it to LANGSMITH_EVALUATION_DIR as JSON, keyed by each prediction's run ID.

    Args:
        predictions (dict): Predictions dict generated by the model, keyed by instance ID
        full_dataset (list): List of all instances in the dataset
        run_id (str): Run ID of this evaluation
    """
    feedback_for_all_instances = {}
    for instance in full_dataset:
        feedback_for_instance = []
        instance_id = instance['instance_id']
        prediction = predictions[instance_id]
        if prediction.get("model_patch", None) in ["", None]:
            # Prediction returned an empty patch
            feedback_for_all_instances[prediction['run_id']] = [
                {"key": "non-empty-patch", "score": 0},
                {"key": "completed-patch", "score": 0},
                {"key": "resolved-patch", "score": 0}
            ]
            continue
        feedback_for_instance.append({"key": "non-empty-patch", "score": 1})
        report_file = (
            RUN_EVALUATION_LOG_DIR
            / run_id
            / prediction["model_name_or_path"].replace("/", "__")
            / prediction['instance_id']
            / "report.json"
        )
        if report_file.exists():
            # If the report file exists, the instance has been run
            feedback_for_instance.append({"key": "completed-patch", "score": 1})
            report = json.loads(report_file.read_text())
            # Check if the patch actually resolved the issue
            if report[instance_id]["resolved"]:
                feedback_for_instance.append({"key": "resolved-patch", "score": 1})
            else:
                feedback_for_instance.append({"key": "resolved-patch", "score": 0})
        else:
            # The instance did not run successfully
            feedback_for_instance += [
                {"key": "completed-patch", "score": 0},
                {"key": "resolved-patch", "score": 0}
            ]
        # Key feedback by the run ID stored on the prediction (the LangSmith
        # run that produced it), not by the harness run_id
        feedback_for_all_instances[prediction['run_id']] = feedback_for_instance

    os.makedirs(os.path.dirname(LANGSMITH_EVALUATION_DIR), exist_ok=True)
    with open(LANGSMITH_EVALUATION_DIR, 'w') as json_file:
        json.dump(feedback_for_all_instances, json_file)
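
# For reference, the feedback file written above has roughly the following
# shape; the run ID and scores shown are illustrative, not real values:
#
# {
#     "<langsmith-run-id>": [
#         {"key": "non-empty-patch", "score": 1},
#         {"key": "completed-patch", "score": 1},
#         {"key": "resolved-patch", "score": 0}
#     ]
# }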


def evaluate_predictions(
    dataset: list,
    predictions: dict,
    max_workers: int,
    force_rebuild: bool,
    cache_level: str,
    clean: bool,
    open_file_limit: int,
    run_id: str,
    timeout: int,
):
    """
    Run the evaluation harness for the given dataset and predictions, then
    convert the results into LangSmith feedback.
    """
    assert len(run_id) > 0, "Run ID must be provided"
    # set open file limit
    resource.setrlimit(resource.RLIMIT_NOFILE, (open_file_limit, open_file_limit))
    client = docker.from_env()
    existing_images = list_images(client)
    print(f"Running {len(dataset)} unevaluated instances...")
    # build environment images + run instances
    build_env_images(client, dataset, force_rebuild, max_workers)
    run_instances(
        predictions, dataset, cache_level, clean, force_rebuild, max_workers, run_id, timeout
    )
    # clean images + convert the run logs into LangSmith feedback
    clean_images(client, existing_images, cache_level, clean)
    convert_runs_to_langsmith_feedback(predictions, dataset, run_id)
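

# A minimal usage sketch. The file paths and parameter values below are
# illustrative assumptions, not values defined by the harness.
if __name__ == "__main__":
    # Assumed inputs: a list of SWE-bench instances and a predictions dict
    # keyed by instance_id, where each prediction carries "model_patch",
    # "model_name_or_path", and the LangSmith "run_id" to attach feedback to.
    dataset = json.loads(Path("swebench_instances.json").read_text())  # hypothetical file
    predictions = json.loads(Path("predictions.json").read_text())  # hypothetical file
    evaluate_predictions(
        dataset=dataset,
        predictions=predictions,
        max_workers=4,
        force_rebuild=False,
        cache_level="env",
        clean=False,
        open_file_limit=4096,
        run_id="swebench-langsmith-demo",
        timeout=1800,
    )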