Few-shot Benchmark Report

Run ID: 20260511T181245Z | Generated UTC: 2026-05-11T18:12:58.112396+00:00

Configuration

{
  "models": [
    "sonnet-4-6"
  ],
  "fewshot_strategies": [
    "raw_only"
  ],
  "chat_count": 20,
  "runs_per_chat": 1,
  "max_workers": 15,
  "raw_fewshot_count": 4,
  "raw_fewshot_labels_filter": [
    "[chats] multiple_product_multiple_shipment_complex.json",
    "[chats] single_product_multiple_shipment_complex.json",
    "[chats] single_product_single_shipment_complex.json",
    "[updates] update_change_quantity.json",
    "[updates] update_change_unit_price.json"
  ]
}

Model + Strategy Summary

Model | Strategy | Runs | Success rate | Avg attempts | Avg elapsed (s) | Avg mismatch/expected run | Field match rate
sonnet-4-6 | raw_only | 20 | 1.0000 | 1.0000 | 6.5684 | 3.4500 | 0.8969

Per-chat Breakdown

Chat | Model | Strategy | Runs | Success rate | Avg elapsed (s) | Mismatch counts
multiple_product_multiple_shipment_medium.json | sonnet-4-6 | raw_only | 2 | 1.0000 | 8.1344 | [6, 7]
multiple_product_multiple_shipment_simple.json | sonnet-4-6 | raw_only | 2 | 1.0000 | 6.3388 | [7, 1]
real_world_msgs_test_v1.json | sonnet-4-6 | raw_only | 2 | 1.0000 | 5.9462 | [5, 5]
real_world_msgs_test_v2.json | sonnet-4-6 | raw_only | 2 | 1.0000 | 7.1576 | [3, 3]
real_world_msgs_test_v3.json | sonnet-4-6 | raw_only | 2 | 1.0000 | 7.0386 | [4, 4]
single_product_multiple_shipment_complex.json | sonnet-4-6 | raw_only | 2 | 1.0000 | 8.3003 | [1, 0]
single_product_multiple_shipment_medium.json | sonnet-4-6 | raw_only | 2 | 1.0000 | 6.4152 | [1, 4]
single_product_multiple_shipment_simple.json | sonnet-4-6 | raw_only | 2 | 1.0000 | 5.7560 | [0, 0]
single_product_single_shipment_complex.json | sonnet-4-6 | raw_only | 2 | 1.0000 | 5.8004 | [6, 6]
single_product_single_shipment_medium.json | sonnet-4-6 | raw_only | 2 | 1.0000 | 4.7965 | [3, 3]

Top Mismatches (up to 100 runs)

Chat | Model | Strategy | Run | Mismatch count | Sample mismatches
multiple_product_multiple_shipment_simple.json | sonnet-4-6 | raw_only | 1 | 7
[
  {
    "path": "data[0].items[0].unit_price",
    "expected": 25.0,
    "actual": 250.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/BAG",
    "actual": ""
  },
  {
    "path": "data[0].items[0].total",
    "expected": 250.0,
    "actual": 2500.0
  },
  {
    "path": "data[0].items[1].description",
    "expected": "Assam tea",
    "actual": "Assam Tea"
  },
  {
    "path": "data[0].items[1].unit_price",
    "expected": 12.0,
    "actual": 240.0
  }
]
multiple_product_multiple_shipment_medium.json | sonnet-4-6 | raw_only | 1 | 7
[
  {
    "path": "data",
    "expected_len": 1,
    "actual_len": 2
  },
  {
    "path": "data[0].items",
    "expected_len": 3,
    "actual_len": 1
  },
  {
    "path": "data[0].items[0].quantity_unit",
    "expected": "bags",
    "actual": "BAGS"
  },
  {
    "path": "data[0].items[0].unit_price",
    "expected": null,
    "actual": 25.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "",
    "actual": "USD/BAG"
  }
]
single_product_single_shipment_complex.json | sonnet-4-6 | raw_only | 1 | 6
[
  {
    "path": "data[0].items[0].quantity_unit",
    "expected": "bags",
    "actual": "BAGS"
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/bag",
    "actual": "USD/BAG"
  },
  {
    "path": "data[0].items[0].shipment_date",
    "expected": "2025-11-28",
    "actual": "2026-11-28"
  },
  {
    "path": "data[0].items[0].shipping_address",
    "expected": "352 Indiana Jones St.",
    "actual": "352 Indiana Jones St"
  },
  {
    "path": "data[0].do_date",
    "expected": "2025-11-28",
    "actual": "2026-11-28"
  }
]
single_product_single_shipment_complex.json | sonnet-4-6 | raw_only | 1 | 6
[
  {
    "path": "data[0].items[0].quantity_unit",
    "expected": "bags",
    "actual": "BAGS"
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/bag",
    "actual": "USD/BAG"
  },
  {
    "path": "data[0].items[0].shipment_date",
    "expected": "2025-11-28",
    "actual": "2026-11-28"
  },
  {
    "path": "data[0].items[0].shipping_address",
    "expected": "352 Indiana Jones St.",
    "actual": "352 Indiana Jones St"
  },
  {
    "path": "data[0].do_date",
    "expected": "2025-11-28",
    "actual": "2026-11-28"
  }
]
multiple_product_multiple_shipment_medium.json | sonnet-4-6 | raw_only | 1 | 6
[
  {
    "path": "data",
    "expected_len": 1,
    "actual_len": 2
  },
  {
    "path": "data[0].items",
    "expected_len": 3,
    "actual_len": 1
  },
  {
    "path": "data[0].items[0].quantity_unit",
    "expected": "bags",
    "actual": "BAGS"
  },
  {
    "path": "data[0].items[0].unit_price",
    "expected": null,
    "actual": 300.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "",
    "actual": "USD/BAGS"
  }
]
real_world_msgs_test_v1.json | sonnet-4-6 | raw_only | 1 | 5
[
  {
    "path": "data[0].items[0].description",
    "expected": "soy lecithin powder",
    "actual": "Soy Lecithin Powder"
  },
  {
    "path": "data[0].items[0].unit_price",
    "expected": 4.1,
    "actual": 4100.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[0].items[0].loading",
    "expected": "12MT/20'FCL",
    "actual": "2*12MT/20'FCL"
  },
  {
    "path": "data[0].shipping_method",
    "expected": "",
    "actual": "by sea"
  }
]
real_world_msgs_test_v1.json | sonnet-4-6 | raw_only | 1 | 5
[
  {
    "path": "data[0].items[0].description",
    "expected": "soy lecithin powder",
    "actual": "Soy Lecithin Powder"
  },
  {
    "path": "data[0].items[0].unit_price",
    "expected": 4.1,
    "actual": 4100.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[0].items[0].loading",
    "expected": "12MT/20'FCL",
    "actual": "2 x 12MT/20'FCL"
  },
  {
    "path": "data[0].shipping_method",
    "expected": "",
    "actual": "by sea"
  }
]
single_product_multiple_shipment_medium.json | sonnet-4-6 | raw_only | 1 | 4
[
  {
    "path": "data",
    "expected_len": 1,
    "actual_len": 2
  },
  {
    "path": "data[0].items",
    "expected_len": 2,
    "actual_len": 1
  },
  {
    "path": "data[0].items[0].description",
    "expected": "KNM Coffee",
    "actual": "KNM Coffee (Medium Roast)"
  },
  {
    "path": "data[0].shipping_address",
    "expected": "",
    "actual": "100 Finance Ave"
  }
]
real_world_msgs_test_v3.json | sonnet-4-6 | raw_only | 1 | 4
[
  {
    "path": "data[0].items[0].description",
    "expected": "lecithin fat powder",
    "actual": "Lecithin Fat Powder"
  },
  {
    "path": "data[0].items[0].total",
    "expected": 96000.0,
    "actual": null
  },
  {
    "path": "data[0].items[1].description",
    "expected": "lecithin fat powder",
    "actual": "Lecithin Fat Powder"
  },
  {
    "path": "data[0].items[1].total",
    "expected": 144000.0,
    "actual": null
  }
]
real_world_msgs_test_v3.json | sonnet-4-6 | raw_only | 1 | 4
[
  {
    "path": "data[0].items[0].description",
    "expected": "lecithin fat powder",
    "actual": "Lecithin Fat Powder"
  },
  {
    "path": "data[0].items[0].total",
    "expected": 96000.0,
    "actual": null
  },
  {
    "path": "data[0].items[1].description",
    "expected": "lecithin fat powder",
    "actual": "Lecithin Fat Powder"
  },
  {
    "path": "data[0].items[1].total",
    "expected": 144000.0,
    "actual": null
  }
]
single_product_single_shipment_medium.json | sonnet-4-6 | raw_only | 1 | 3
[
  {
    "path": "data[0].items[0].shipment_date",
    "expected": "2026-05-28",
    "actual": "2026-05-31"
  },
  {
    "path": "data[0].do_date",
    "expected": "2026-05-28",
    "actual": "2026-05-31"
  },
  {
    "path": "data[0].payment_date",
    "expected": "Net 30 from delivery",
    "actual": ""
  }
]
single_product_single_shipment_medium.json | sonnet-4-6 | raw_only | 1 | 3
[
  {
    "path": "data[0].items[0].shipment_date",
    "expected": "2026-05-28",
    "actual": "2026-05-31"
  },
  {
    "path": "data[0].do_date",
    "expected": "2026-05-28",
    "actual": "2026-05-31"
  },
  {
    "path": "data[0].payment_date",
    "expected": "Net 30 from delivery",
    "actual": ""
  }
]
real_world_msgs_test_v2.json | sonnet-4-6 | raw_only | 1 | 3
[
  {
    "path": "data[1].items[0].unit_price",
    "expected": 4.2,
    "actual": 4200.0
  },
  {
    "path": "data[1].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[1].items[0].loading",
    "expected": "",
    "actual": "18MT/40'FCL"
  }
]
real_world_msgs_test_v2.json | sonnet-4-6 | raw_only | 1 | 3
[
  {
    "path": "data[1].items[0].unit_price",
    "expected": 4.2,
    "actual": 4200.0
  },
  {
    "path": "data[1].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[1].items[0].loading",
    "expected": "",
    "actual": "18MT/40'FCL"
  }
]
single_product_multiple_shipment_medium.json | sonnet-4-6 | raw_only | 1 | 1
[
  {
    "path": "data[0].do_date",
    "expected": "2026-05-31",
    "actual": ""
  }
]
single_product_multiple_shipment_complex.json | sonnet-4-6 | raw_only | 1 | 1
[
  {
    "path": "data[0].do_date",
    "expected": "2026-03-10",
    "actual": ""
  }
]
multiple_product_multiple_shipment_simple.json | sonnet-4-6 | raw_only | 1 | 1
[
  {
    "path": "data[0].items[1].description",
    "expected": "Assam tea",
    "actual": "Assam Tea"
  }
]