from openevals.simulators import run_multiturn_simulation, create_llm_simulated_user
from openevals.llm import create_llm_as_judge
from openevals.types import ChatCompletionMessage
from langsmith.wrappers import wrap_openai
from langsmith import Client
from openai import OpenAI
# LangSmith client used below to create the dataset and run the evaluation.
ls_client = Client()
# (opening user message, simulated-user persona prompt) pairs used to seed
# each dataset example.
_PERSONAS = [
    (
        "I want a refund for my car!",
        "You are an angry and belligerent customer who wants a refund for their car.",
    ),
    (
        "Please give me a refund for my car.",
        "You are a nice customer who wants a refund for their car.",
    ),
]

# Dataset examples: each carries the conversation opener plus the persona
# prompt that drives the simulated user.
examples = [
    {
        "inputs": {
            "messages": [{"role": "user", "content": opener}],
            "simulated_user_prompt": persona,
        },
    }
    for opener, persona in _PERSONAS
]
# Create the LangSmith dataset and upload the examples defined above.
dataset = ls_client.create_dataset(dataset_name="multiturn-with-personas")
ls_client.create_examples(dataset_id=dataset.id, examples=examples)
# Judge prompt: the full conversation trajectory is interpolated as {outputs}.
_SATISFACTION_PROMPT = "Based on the below conversation, was the user satisfied?\n{outputs}"

# LLM-as-judge evaluator that scores whether the simulated user ended up satisfied.
trajectory_evaluator = create_llm_as_judge(
    prompt=_SATISFACTION_PROMPT,
    feedback_key="satisfaction",
    model="openai:o3-mini",
)
def target(inputs: dict, *, max_turns: int = 5) -> list:
    """Run a simulated multi-turn support conversation for one dataset example.

    Args:
        inputs: Example inputs with "messages" (the fixed opening user
            message(s)) and "simulated_user_prompt" (the persona system
            prompt for the simulated user).
        max_turns: Maximum number of conversation turns to simulate.
            Defaults to 5, preserving the original hard-coded behavior;
            made a keyword-only parameter so callers can tune it without
            breaking the `evaluate(target, ...)` call signature.

    Returns:
        The message trajectory produced by the simulation.
    """
    # Wrap the OpenAI client so its calls are traced in LangSmith.
    client = wrap_openai(OpenAI())
    # Per-thread conversation history, keyed by the simulator's thread_id.
    history: dict = {}

    def app(next_message: ChatCompletionMessage, *, thread_id: str):
        # Record the incoming (simulated) user message for this thread.
        # setdefault+append replaces the original's list re-concatenation,
        # which rebuilt the history list on every turn.
        thread = history.setdefault(thread_id, [])
        thread.append(next_message)
        res = client.chat.completions.create(
            model="gpt-4.1-nano",
            messages=[
                {
                    "role": "system",
                    "content": "You are a patient and understanding customer service agent.",
                }
            ]
            + thread,
        )
        response = res.choices[0].message
        # Keep the assistant reply so the next turn sees the full context.
        thread.append(response)
        return response

    # Simulated user driven by the example's persona prompt; its first
    # response(s) are fixed to the example's opening messages.
    user = create_llm_simulated_user(
        system=inputs["simulated_user_prompt"],
        model="openai:gpt-4.1-nano",
        fixed_responses=inputs["messages"],
    )
    res = run_multiturn_simulation(
        app=app,
        user=user,
        max_turns=max_turns,
    )
    return res["trajectory"]
# Run the multi-turn simulation target over every example in the dataset and
# score each trajectory with the satisfaction judge.
results = ls_client.evaluate(target, data=dataset.name, evaluators=[trajectory_evaluator])