Few-shot Benchmark Report

Run ID: 20260507T185050Z | Generated UTC: 2026-05-07T18:51:11.569613+00:00

Configuration

{
  "models": [
    "sonnet-4-6"
  ],
  "fewshot_strategies": [
    "zero_shot"
  ],
  "chat_count": 20,
  "runs_per_chat": 2,
  "max_workers": 15,
  "raw_fewshot_count": 2,
  "raw_fewshot_labels_filter": [
    "[chats] multiple_product_multiple_shipment_complex.json",
    "[chats] single_product_multiple_shipment_complex.json",
    "[chats] single_product_single_shipment_complex.json",
    "[updates] update_change_quantity.json",
    "[updates] update_change_unit_price.json"
  ]
}

Model + Strategy Summary

| Model | Strategy | Runs | Success rate | Avg attempts | Avg elapsed (s) | Avg mismatch/expected run | Field match rate |
| --- | --- | --- | --- | --- | --- | --- | --- |
| sonnet-4-6 | zero_shot | 40 | 1.0000 | 1.0000 | 6.6461 | 4.1000 | 0.8750 |

Per-chat Breakdown

| Chat | Model | Strategy | Runs | Success rate | Avg elapsed (s) | Mismatch counts |
| --- | --- | --- | --- | --- | --- | --- |
| multiple_product_multiple_shipment_medium.json | sonnet-4-6 | zero_shot | 4 | 1.0000 | 8.0648 | [4, 7, 4, 7] |
| multiple_product_multiple_shipment_simple.json | sonnet-4-6 | zero_shot | 4 | 1.0000 | 6.3865 | [7, 7, 7, 7] |
| real_world_msgs_test_v1.json | sonnet-4-6 | zero_shot | 4 | 1.0000 | 6.1915 | [5, 6, 6, 6] |
| real_world_msgs_test_v2.json | sonnet-4-6 | zero_shot | 4 | 1.0000 | 7.0019 | [3, 3, 5, 3] |
| real_world_msgs_test_v3.json | sonnet-4-6 | zero_shot | 4 | 1.0000 | 6.5626 | [9, 4, 4, 8] |
| single_product_multiple_shipment_complex.json | sonnet-4-6 | zero_shot | 4 | 1.0000 | 8.0457 | [0, 0, 0, 0] |
| single_product_multiple_shipment_medium.json | sonnet-4-6 | zero_shot | 4 | 1.0000 | 7.6290 | [4, 4, 4, 4] |
| single_product_multiple_shipment_simple.json | sonnet-4-6 | zero_shot | 4 | 1.0000 | 6.0944 | [0, 0, 4, 0] |
| single_product_single_shipment_complex.json | sonnet-4-6 | zero_shot | 4 | 1.0000 | 5.6090 | [4, 6, 6, 4] |
| single_product_single_shipment_medium.json | sonnet-4-6 | zero_shot | 4 | 1.0000 | 4.8754 | [3, 3, 3, 3] |

Top Mismatches (up to 100 runs)

Chat | Model | Strategy | Run | Mismatch count | Sample mismatches
real_world_msgs_test_v3.json | sonnet-4-6 | zero_shot | 1 | 9
[
  {
    "path": "data[0].items[0].description",
    "expected": "lecithin fat powder",
    "actual": "Lecithin Fat Powder"
  },
  {
    "path": "data[0].items[0].unit_price",
    "expected": 12.0,
    "actual": 12000.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[0].items[0].ship_term",
    "expected": "",
    "actual": "FOB"
  },
  {
    "path": "data[0].items[1].description",
    "expected": "lecithin fat powder",
    "actual": "Lecithin Fat Powder"
  }
]
real_world_msgs_test_v3.json | sonnet-4-6 | zero_shot | 2 | 8
[
  {
    "path": "data[0].items[0].description",
    "expected": "lecithin fat powder",
    "actual": "Lecithin Fat Powder"
  },
  {
    "path": "data[0].items[0].unit_price",
    "expected": 12.0,
    "actual": 12000.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[0].items[0].ship_term",
    "expected": "",
    "actual": "FOB"
  },
  {
    "path": "data[0].items[1].description",
    "expected": "lecithin fat powder",
    "actual": "Lecithin Fat Powder"
  }
]
multiple_product_multiple_shipment_simple.json | sonnet-4-6 | zero_shot | 1 | 7
[
  {
    "path": "data[0].items[0].unit_price",
    "expected": 25.0,
    "actual": 250.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/BAG",
    "actual": "USD/BAGS"
  },
  {
    "path": "data[0].items[0].total",
    "expected": 250.0,
    "actual": 2500.0
  },
  {
    "path": "data[0].items[1].description",
    "expected": "Assam tea",
    "actual": "Assam Tea"
  },
  {
    "path": "data[0].items[1].unit_price",
    "expected": 12.0,
    "actual": 240.0
  }
]
multiple_product_multiple_shipment_simple.json | sonnet-4-6 | zero_shot | 2 | 7
[
  {
    "path": "data[0].items[0].unit_price",
    "expected": 25.0,
    "actual": 250.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/BAG",
    "actual": "USD/BAGS"
  },
  {
    "path": "data[0].items[0].total",
    "expected": 250.0,
    "actual": 2500.0
  },
  {
    "path": "data[0].items[1].description",
    "expected": "Assam tea",
    "actual": "Assam Tea"
  },
  {
    "path": "data[0].items[1].unit_price",
    "expected": 12.0,
    "actual": 240.0
  }
]
multiple_product_multiple_shipment_simple.json | sonnet-4-6 | zero_shot | 1 | 7
[
  {
    "path": "data[0].items[0].unit_price",
    "expected": 25.0,
    "actual": 250.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/BAG",
    "actual": "USD/BAGS"
  },
  {
    "path": "data[0].items[0].total",
    "expected": 250.0,
    "actual": 2500.0
  },
  {
    "path": "data[0].items[1].description",
    "expected": "Assam tea",
    "actual": "Assam Tea"
  },
  {
    "path": "data[0].items[1].unit_price",
    "expected": 12.0,
    "actual": 240.0
  }
]
multiple_product_multiple_shipment_simple.json | sonnet-4-6 | zero_shot | 2 | 7
[
  {
    "path": "data[0].items[0].unit_price",
    "expected": 25.0,
    "actual": 250.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/BAG",
    "actual": "USD/BAGS"
  },
  {
    "path": "data[0].items[0].total",
    "expected": 250.0,
    "actual": 2500.0
  },
  {
    "path": "data[0].items[1].description",
    "expected": "Assam tea",
    "actual": "Assam Tea"
  },
  {
    "path": "data[0].items[1].unit_price",
    "expected": 12.0,
    "actual": 240.0
  }
]
multiple_product_multiple_shipment_medium.json | sonnet-4-6 | zero_shot | 2 | 7
[
  {
    "path": "data",
    "expected_len": 1,
    "actual_len": 2
  },
  {
    "path": "data[0].items",
    "expected_len": 3,
    "actual_len": 1
  },
  {
    "path": "data[0].items[0].quantity_unit",
    "expected": "bags",
    "actual": "BAGS"
  },
  {
    "path": "data[0].items[0].unit_price",
    "expected": null,
    "actual": 300.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "",
    "actual": "USD"
  }
]
multiple_product_multiple_shipment_medium.json | sonnet-4-6 | zero_shot | 1 | 7
[
  {
    "path": "data",
    "expected_len": 1,
    "actual_len": 2
  },
  {
    "path": "data[0].items",
    "expected_len": 3,
    "actual_len": 1
  },
  {
    "path": "data[0].items[0].quantity_unit",
    "expected": "bags",
    "actual": "BAGS"
  },
  {
    "path": "data[0].items[0].unit_price",
    "expected": null,
    "actual": 300.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "",
    "actual": "USD"
  }
]
single_product_single_shipment_complex.json | sonnet-4-6 | zero_shot | 2 | 6
[
  {
    "path": "data[0].items[0].quantity_unit",
    "expected": "bags",
    "actual": "BAGS"
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/bag",
    "actual": "USD/BAG"
  },
  {
    "path": "data[0].items[0].shipment_date",
    "expected": "2025-11-28",
    "actual": "2026-11-28"
  },
  {
    "path": "data[0].items[0].shipping_address",
    "expected": "352 Indiana Jones St.",
    "actual": "352 Indiana Jones St"
  },
  {
    "path": "data[0].do_date",
    "expected": "2025-11-28",
    "actual": "2026-11-28"
  }
]
single_product_single_shipment_complex.json | sonnet-4-6 | zero_shot | 2 | 6
[
  {
    "path": "data[0].items[0].quantity_unit",
    "expected": "bags",
    "actual": "BAGS"
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/bag",
    "actual": "USD/BAG"
  },
  {
    "path": "data[0].items[0].shipment_date",
    "expected": "2025-11-28",
    "actual": "2026-11-28"
  },
  {
    "path": "data[0].items[0].shipping_address",
    "expected": "352 Indiana Jones St.",
    "actual": "352 Indiana Jones St"
  },
  {
    "path": "data[0].do_date",
    "expected": "2025-11-28",
    "actual": "2026-11-28"
  }
]
real_world_msgs_test_v1.json | sonnet-4-6 | zero_shot | 1 | 6
[
  {
    "path": "data[0].items[0].description",
    "expected": "soy lecithin powder",
    "actual": "Soy Lecithin Powder"
  },
  {
    "path": "data[0].items[0].unit_price",
    "expected": 4.1,
    "actual": 4100.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[0].items[0].shipping_address",
    "expected": "",
    "actual": "Busan"
  },
  {
    "path": "data[0].items[0].loading",
    "expected": "12MT/20'FCL",
    "actual": "2x 12MT/20'FCL"
  }
]
real_world_msgs_test_v1.json | sonnet-4-6 | zero_shot | 2 | 6
[
  {
    "path": "data[0].items[0].description",
    "expected": "soy lecithin powder",
    "actual": "Soy Lecithin Powder"
  },
  {
    "path": "data[0].items[0].unit_price",
    "expected": 4.1,
    "actual": 4100.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[0].items[0].shipping_address",
    "expected": "",
    "actual": "Busan"
  },
  {
    "path": "data[0].items[0].loading",
    "expected": "12MT/20'FCL",
    "actual": "2x 12MT/20'FCL"
  }
]
real_world_msgs_test_v1.json | sonnet-4-6 | zero_shot | 1 | 6
[
  {
    "path": "data[0].items[0].description",
    "expected": "soy lecithin powder",
    "actual": "Soy Lecithin Powder"
  },
  {
    "path": "data[0].items[0].unit_price",
    "expected": 4.1,
    "actual": 4100.0
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[0].items[0].shipping_address",
    "expected": "",
    "actual": "Busan"
  },
  {
    "path": "data[0].items[0].loading",
    "expected": "12MT/20'FCL",
    "actual": "2x 12MT/20'FCL"
  }
]
real_world_msgs_test_v2.json | sonnet-4-6 | zero_shot | 1 | 5
[
  {
    "path": "data[0].items[0].shipping_address",
    "expected": "",
    "actual": "Busan"
  },
  {
    "path": "data[1].items[0].unit_price",
    "expected": 4.2,
    "actual": 4200.0
  },
  {
    "path": "data[1].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[1].items[0].shipping_address",
    "expected": "",
    "actual": "Busan"
  },
  {
    "path": "data[1].items[0].loading",
    "expected": "",
    "actual": "18MT/40'FCL"
  }
]
real_world_msgs_test_v1.json | sonnet-4-6 | zero_shot | 2 | 5
[
  {
    "path": "data[0].items[0].description",
    "expected": "soy lecithin powder",
    "actual": "Soy Lecithin Powder"
  },
  {
    "path": "data[0].items[0].shipping_address",
    "expected": "",
    "actual": "Busan"
  },
  {
    "path": "data[0].items[0].loading",
    "expected": "12MT/20'FCL",
    "actual": "2x 12MT/20'FCL"
  },
  {
    "path": "data[0].items[0].total",
    "expected": 98400.0,
    "actual": null
  },
  {
    "path": "data[0].shipping_method",
    "expected": "",
    "actual": "by sea"
  }
]
single_product_single_shipment_complex.json | sonnet-4-6 | zero_shot | 1 | 4
[
  {
    "path": "data[0].items[0].shipment_date",
    "expected": "2025-11-28",
    "actual": "2026-11-28"
  },
  {
    "path": "data[0].items[0].shipping_address",
    "expected": "352 Indiana Jones St.",
    "actual": "352 Indiana Jones St"
  },
  {
    "path": "data[0].do_date",
    "expected": "2025-11-28",
    "actual": "2026-11-28"
  },
  {
    "path": "data[0].shipping_address",
    "expected": "352 Indiana Jones St.",
    "actual": "352 Indiana Jones St"
  }
]
single_product_single_shipment_complex.json | sonnet-4-6 | zero_shot | 1 | 4
[
  {
    "path": "data[0].items[0].shipment_date",
    "expected": "2025-11-28",
    "actual": "2026-11-28"
  },
  {
    "path": "data[0].items[0].shipping_address",
    "expected": "352 Indiana Jones St.",
    "actual": "352 Indiana Jones St"
  },
  {
    "path": "data[0].do_date",
    "expected": "2025-11-28",
    "actual": "2026-11-28"
  },
  {
    "path": "data[0].shipping_address",
    "expected": "352 Indiana Jones St.",
    "actual": "352 Indiana Jones St"
  }
]
single_product_multiple_shipment_simple.json | sonnet-4-6 | zero_shot | 1 | 4
[
  {
    "path": "data[0].items[0].quantity_unit",
    "expected": "BAGS",
    "actual": "bags"
  },
  {
    "path": "data[0].items[0].pricing_unit",
    "expected": "USD/BAG",
    "actual": "USD/bag"
  },
  {
    "path": "data[0].items[1].quantity_unit",
    "expected": "BAGS",
    "actual": "bags"
  },
  {
    "path": "data[0].items[1].pricing_unit",
    "expected": "USD/BAG",
    "actual": "USD/bag"
  }
]
single_product_multiple_shipment_medium.json | sonnet-4-6 | zero_shot | 2 | 4
[
  {
    "path": "data",
    "expected_len": 1,
    "actual_len": 2
  },
  {
    "path": "data[0].items",
    "expected_len": 2,
    "actual_len": 1
  },
  {
    "path": "data[0].items[0].description",
    "expected": "KNM Coffee",
    "actual": "KNM Coffee (medium roast)"
  },
  {
    "path": "data[0].shipping_address",
    "expected": "",
    "actual": "100 Finance Ave"
  }
]
single_product_multiple_shipment_medium.json | sonnet-4-6 | zero_shot | 1 | 4
[
  {
    "path": "data",
    "expected_len": 1,
    "actual_len": 2
  },
  {
    "path": "data[0].items",
    "expected_len": 2,
    "actual_len": 1
  },
  {
    "path": "data[0].items[0].description",
    "expected": "KNM Coffee",
    "actual": "KNM Coffee (Medium Roast)"
  },
  {
    "path": "data[0].shipping_address",
    "expected": "",
    "actual": "100 Finance Ave"
  }
]
single_product_multiple_shipment_medium.json | sonnet-4-6 | zero_shot | 2 | 4
[
  {
    "path": "data",
    "expected_len": 1,
    "actual_len": 2
  },
  {
    "path": "data[0].items",
    "expected_len": 2,
    "actual_len": 1
  },
  {
    "path": "data[0].items[0].description",
    "expected": "KNM Coffee",
    "actual": "KNM Coffee (medium roast)"
  },
  {
    "path": "data[0].shipping_address",
    "expected": "",
    "actual": "100 Finance Ave"
  }
]
single_product_multiple_shipment_medium.json | sonnet-4-6 | zero_shot | 1 | 4
[
  {
    "path": "data",
    "expected_len": 1,
    "actual_len": 2
  },
  {
    "path": "data[0].items",
    "expected_len": 2,
    "actual_len": 1
  },
  {
    "path": "data[0].items[0].description",
    "expected": "KNM Coffee",
    "actual": "KNM Coffee (medium roast)"
  },
  {
    "path": "data[0].shipping_address",
    "expected": "",
    "actual": "100 Finance Ave"
  }
]
real_world_msgs_test_v3.json | sonnet-4-6 | zero_shot | 2 | 4
[
  {
    "path": "data[0].items[0].description",
    "expected": "lecithin fat powder",
    "actual": "Lecithin Fat Powder"
  },
  {
    "path": "data[0].items[0].total",
    "expected": 96000.0,
    "actual": null
  },
  {
    "path": "data[0].items[1].description",
    "expected": "lecithin fat powder",
    "actual": "Lecithin Fat Powder"
  },
  {
    "path": "data[0].items[1].total",
    "expected": 144000.0,
    "actual": null
  }
]
real_world_msgs_test_v3.json | sonnet-4-6 | zero_shot | 1 | 4
[
  {
    "path": "data[0].items[0].description",
    "expected": "lecithin fat powder",
    "actual": "Lecithin Fat Powder"
  },
  {
    "path": "data[0].items[0].total",
    "expected": 96000.0,
    "actual": null
  },
  {
    "path": "data[0].items[1].description",
    "expected": "lecithin fat powder",
    "actual": "Lecithin Fat Powder"
  },
  {
    "path": "data[0].items[1].total",
    "expected": 144000.0,
    "actual": null
  }
]
multiple_product_multiple_shipment_medium.json | sonnet-4-6 | zero_shot | 1 | 4
[
  {
    "path": "data",
    "expected_len": 1,
    "actual_len": 2
  },
  {
    "path": "data[0].items",
    "expected_len": 3,
    "actual_len": 1
  },
  {
    "path": "data[0].items[0].quantity_unit",
    "expected": "bags",
    "actual": "BAGS"
  },
  {
    "path": "data[0].do_date",
    "expected": "2026-03-05",
    "actual": "2026-02-28"
  }
]
multiple_product_multiple_shipment_medium.json | sonnet-4-6 | zero_shot | 2 | 4
[
  {
    "path": "data",
    "expected_len": 1,
    "actual_len": 2
  },
  {
    "path": "data[0].items",
    "expected_len": 3,
    "actual_len": 1
  },
  {
    "path": "data[0].items[0].quantity_unit",
    "expected": "bags",
    "actual": "BAGS"
  },
  {
    "path": "data[0].do_date",
    "expected": "2026-03-05",
    "actual": "2026-02-28"
  }
]
single_product_single_shipment_medium.json | sonnet-4-6 | zero_shot | 1 | 3
[
  {
    "path": "data[0].items[0].shipment_date",
    "expected": "2026-05-28",
    "actual": "2026-05-31"
  },
  {
    "path": "data[0].do_date",
    "expected": "2026-05-28",
    "actual": "2026-05-31"
  },
  {
    "path": "data[0].payment_date",
    "expected": "Net 30 from delivery",
    "actual": ""
  }
]
single_product_single_shipment_medium.json | sonnet-4-6 | zero_shot | 2 | 3
[
  {
    "path": "data[0].items[0].shipment_date",
    "expected": "2026-05-28",
    "actual": "2026-05-31"
  },
  {
    "path": "data[0].do_date",
    "expected": "2026-05-28",
    "actual": "2026-05-31"
  },
  {
    "path": "data[0].payment_date",
    "expected": "Net 30 from delivery",
    "actual": ""
  }
]
single_product_single_shipment_medium.json | sonnet-4-6 | zero_shot | 1 | 3
[
  {
    "path": "data[0].items[0].shipment_date",
    "expected": "2026-05-28",
    "actual": "2026-05-31"
  },
  {
    "path": "data[0].do_date",
    "expected": "2026-05-28",
    "actual": "2026-05-31"
  },
  {
    "path": "data[0].payment_date",
    "expected": "Net 30 from delivery",
    "actual": ""
  }
]
single_product_single_shipment_medium.json | sonnet-4-6 | zero_shot | 2 | 3
[
  {
    "path": "data[0].items[0].shipment_date",
    "expected": "2026-05-28",
    "actual": "2026-05-31"
  },
  {
    "path": "data[0].do_date",
    "expected": "2026-05-28",
    "actual": "2026-05-31"
  },
  {
    "path": "data[0].payment_date",
    "expected": "Net 30 from delivery",
    "actual": ""
  }
]
real_world_msgs_test_v2.json | sonnet-4-6 | zero_shot | 1 | 3
[
  {
    "path": "data[1].items[0].unit_price",
    "expected": 4.2,
    "actual": 4200.0
  },
  {
    "path": "data[1].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[1].items[0].loading",
    "expected": "",
    "actual": "18MT/40'FCL"
  }
]
real_world_msgs_test_v2.json | sonnet-4-6 | zero_shot | 2 | 3
[
  {
    "path": "data[1].items[0].unit_price",
    "expected": 4.2,
    "actual": 4200.0
  },
  {
    "path": "data[1].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[1].items[0].loading",
    "expected": "",
    "actual": "18MT/40'FCL"
  }
]
real_world_msgs_test_v2.json | sonnet-4-6 | zero_shot | 2 | 3
[
  {
    "path": "data[1].items[0].unit_price",
    "expected": 4.2,
    "actual": 4200.0
  },
  {
    "path": "data[1].items[0].pricing_unit",
    "expected": "USD/KG",
    "actual": "USD/MT"
  },
  {
    "path": "data[1].items[0].loading",
    "expected": "",
    "actual": "18MT/40'FCL"
  }
]