[
  {
    "id": "counterfactual-f008",
    "app": "batbucks",
    "instruction": "What if I sold all my GameStop and Rivian and rolled the whole thing into VTI? Use cost basis (no live prices) and show me what my portfolio would look like before and after, with percentages. Use LibreOffice Calc to report this.",
    "apps_involved": [
      "BatBucks",
      "LibreOffice Calc"
    ],
    "category": "counterfactual",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent computes the GameStop + Rivian cost-basis sum from the seeded holdings (GME shares \u00d7 avg_cost + RIVN shares \u00d7 avg_cost) and reports that consolidation pool figure",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent reports the new total VTI position after the roll (existing VTI shares plus the additional shares funded by the GameStop+Rivian sell)",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Presents a before/after portfolio composition with percentages showing how each position changes",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Notes explicitly that the calculation uses cost basis (not market price) per the user's request",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent lists Michael's current BatBucks holdings (live count from /api/holdings) with shares times avg_cost before computing the GameStop and Rivian sell pool",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent divides the consolidation pool by VTI's avg_cost and reports the count of additional VTI shares to purchase from the GameStop+Rivian sell proceeds",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent produces the before/after report in LibreOffice Calc (opens Calc, enters the portfolio data and percentages into a spreadsheet, and saves or presents the file)",
          "type": "llm_judge",
          "weight": 0.15
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "hard_app-f003",
    "app": "batbucks",
    "instruction": "Time to rebalance my BatBucks portfolio into something a financial advisor wouldn't laugh at. I'm aiming for roughly 60% VTI / 20% SBUX / 20% DIS across the blue-chip bucket at cost basis (RIVN stays put \u2014 I'm not touching that one). Sell every share of GME I'm holding to free up cash, then use that plus my free cash to buy enough VTI to push the allocation toward 60%. Do this as a cost-basis rebalance, not live prices. Exact percentages aren't required \u2014 a reasonable approximation toward the 60/20/20 shape is the goal.",
    "apps_involved": [
      "BatBucks"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads all current BatBucks holdings (live count from /api/holdings) and records each ticker's avg_cost value at cost basis before placing any orders",
          "type": "llm_judge",
          "weight": 0.126375
        },
        {
          "criterion": "Agent sells every GME share Michael holds (full position liquidation) in a single or combined order",
          "type": "llm_judge",
          "weight": 0.189562
        },
        {
          "criterion": "Agent computes a VTI buy quantity that pushes VTI toward approximately 60% of the rebalanced blue-chip (VTI+SBUX+DIS) cost-basis total, treating the 60/20/20 target as an approximation (RIVN held out of the target bucket)",
          "type": "llm_judge",
          "weight": 0.157768
        },
        {
          "criterion": "Agent places a VTI buy order for the computed quantity",
          "type": "llm_judge",
          "weight": 0.189562
        },
        {
          "criterion": "Agent confirms new holdings row shows GME=0 and VTI shares increased",
          "type": "llm_judge",
          "weight": 0.157768
        },
        {
          "criterion": "Agent explicitly uses cost-basis framing (not market value)",
          "type": "llm_judge",
          "weight": 0.105379
        },
        {
          "criterion": "Agent verifies the GME sell order and VTI buy order persist in the BatBucks order history and that the holdings table reflects the rebalance with cash updated",
          "type": "llm_judge",
          "weight": 0.073585
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f004",
    "app": "batbucks",
    "instruction": "I want downside protection on every position I'm holding in BatBucks. Put a couple Stop order on each one at 15% below what I paid for it, just to give me a floor. Then build me a new watchlist called 'Oscar's Picks' with eight big-name ETFs in it. Make sure the stop-losses actually show up in my open orders and the watchlist is still there if I refresh.",
    "apps_involved": [
      "BatBucks"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens BatBucks and lists all current holdings (live count from /api/holdings) with their avg_cost values to plan the stop-loss orders",
          "type": "llm_judge",
          "weight": 0.090909
        },
        {
          "criterion": "Agent submits a Stop order (single stop price, not Stop-Limit) for every current BatBucks holding (live count from /api/holdings) at ~15% under each ticker's actual avg_cost (computed from holdings)",
          "type": "llm_judge",
          "weight": 0.545455
        },
        {
          "criterion": "Agent creates a new BatBucks watchlist named 'Oscar's Picks'",
          "type": "llm_judge",
          "weight": 0.109091
        },
        {
          "criterion": "Agent adds 8 big-name/mainstream ETF tickers (e.g., VTI, VOO, QQQ, SPY, DIA, IWM, VEA, VWO or comparable large-AUM broad-market ETFs) to the 'Oscar's Picks' watchlist in BatBucks",
          "type": "llm_judge",
          "weight": 0.127273
        },
        {
          "criterion": "Agent reloads BatBucks and confirms stop-loss orders are accounted for (open or filled) for each current holding \u2014 noting that any stop placed below the current trading price may fill immediately \u2014 and the 'Oscar's Picks' watchlist persists after refresh",
          "type": "llm_judge",
          "weight": 0.127273
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "preference_inference-f005",
    "app": "batbucks",
    "instruction": "Take a look at my brokerage portfolio. How bad is my meme-stock problem, and what would a sensible rebalance look like that doesn't totally abandon the names I actually like? Use cost basis, not market prices. While you're at it, peek at my old WUPHF investment agreement in ~/Downloads/ \u2014 that one was the OG mistake, and it sets the pattern for how I get into these things. Return the review of my portfolio in LibreOffice Writer.",
    "apps_involved": [
      "BatBucks"
    ],
    "category": "preference_inference",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent pulls Michael's BatBucks RIVN holding (shares and avg_cost from the holdings view / /api/holdings) and reports it as the source of his RIVN exposure",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent reports Michael's RIVN cost-basis dollar exposure (shares \u00d7 avg_cost from live BatBucks holdings) and expresses RIVN's share as a percent of the total cost-basis portfolio",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent frames the analysis using cost basis (avg_cost) per the user's request, not market value",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent proposes a RIVN trim plan consistent with Michael's historical consumer-brand bias (preserves DIS/SBUX-style narrative names rather than suggesting a pure-index dump)",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent reports the BatBucks holdings rows as the data source for the trim recommendation instead of inventing numbers",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent delivers the full RIVN trim answer including cost-basis concentration percent and recommendation without abandoning the task midway",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent opens ~/Downloads/WUPHF_Investment_Agreement.txt and references it as the founding 'bad bet' pattern that explains the current RIVN over-exposure",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent produces the portfolio review in a LibreOffice Writer document, as requested in the instruction",
          "type": "llm_judge",
          "weight": 0.12
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "pattern_inference"
  },
  {
    "id": "retrieval-f016",
    "app": "batbucks",
    "instruction": "What's my total BatBucks portfolio worth at cost basis across everything I'm holding, and how much cash is sitting in the account?",
    "apps_involved": [
      "BatBucks"
    ],
    "category": "retrieval",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reports Michael's BatBucks portfolio value at cost basis (sum of shares \u00d7 avg_cost across all current holdings, live count from /api/holdings)",
          "type": "llm_judge",
          "weight": 0.4
        },
        {
          "criterion": "Agent reports the cash balance in the BatBucks account separately from holdings value",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent uses cost basis (avg_cost) framing as the user requested, not market value",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent reconciles the reported BatBucks cost-basis total and cash balance against the holdings table rows instead of fabricating numbers",
          "type": "llm_judge",
          "weight": 0.15
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "situated_action-f017",
    "app": "batbucks",
    "instruction": "Trim my GameStop position in half and roll the proceeds into VTI. I'm trying to be less of a degenerate about this.",
    "apps_involved": [
      "BatBucks"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent sells approximately half of the live GME position (roughly half the seeded share count, rounding up or down by one share) as the half-position trim into VTI",
          "type": "llm_judge",
          "weight": 0.256501
        },
        {
          "criterion": "Agent buys VTI using proceeds from the GME sale (not cash from elsewhere)",
          "type": "llm_judge",
          "weight": 0.256501
        },
        {
          "criterion": "Agent confirms both GameStop sell and VTI buy orders executed and verifies the updated BatBucks positions reflect the half-GME trim rebalance",
          "type": "llm_judge",
          "weight": 0.256193
        },
        {
          "criterion": "Agent opens BatBucks and reports the existing GameStop GME position share count (read from the live holdings table) before executing the half-trim sell into VTI",
          "type": "llm_judge",
          "weight": 0.230805
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "preference_inference-f014",
    "app": "cheskepdia",
    "instruction": "Look at my hotel and resort bookings on Cheskepdia. Am I a luxury-hotels guy or a budget-hotels guy, and what's my pattern?",
    "apps_involved": [
      "Cheskepdia"
    ],
    "category": "preference_inference",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent lists all four Cheskepdia trip bookings with their property names (Jamaica, NYC, Barbados, Philadelphia hotels)",
          "type": "llm_judge",
          "weight": 0.210684
        },
        {
          "criterion": "Agent categorizes each of the 4 Cheskepdia hotel bookings as luxury-tier or budget-tier and labels which group the pattern favors",
          "type": "llm_judge",
          "weight": 0.210684
        },
        {
          "criterion": "Agent flags Sandals (Montego Bay + Royal Barbados) as the dominant luxury pattern",
          "type": "llm_judge",
          "weight": 0.210684
        },
        {
          "criterion": "Agent identifies the Philadelphia Radisson booking from the four hotel trips as the business-travel outlier among Michael's luxury-hotels pattern",
          "type": "llm_judge",
          "weight": 0.157863
        },
        {
          "criterion": "Agent lists specific Cheskepdia booking rows (property name, total) from the four trips when stating the luxury-versus-budget verdict",
          "type": "llm_judge",
          "weight": 0.105242
        },
        {
          "criterion": "Agent delivers the full luxury-versus-budget guy verdict with the pattern summary, not stopping before answering the four-trip question",
          "type": "llm_judge",
          "weight": 0.104842
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "pattern_inference"
  },
  {
    "id": "situated_action-f028",
    "app": "cheskepdia",
    "instruction": "I'm feeling nostalgic. Book me an upcoming Scranton weekend, Friday through Sunday. Pick a well-rated property in town and book it.",
    "apps_involved": [
      "Cheskepdia"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent books a Scranton property on Cheskepdia with a future check-in date for the nostalgic weekend trip",
          "type": "llm_judge",
          "weight": 0.295892
        },
        {
          "criterion": "Agent's booked Scranton stay spans Friday check-in through Sunday check-out as requested for the upcoming weekend",
          "type": "llm_judge",
          "weight": 0.236652
        },
        {
          "criterion": "Agent browses the Cheskepdia Scranton search results and picks a well-rated property before booking",
          "type": "llm_judge",
          "weight": 0.236652
        },
        {
          "criterion": "Agent confirms the new Cheskepdia Scranton booking details (property, Friday-to-Sunday dates, total) after submitting the reservation for the nostalgic weekend",
          "type": "llm_judge",
          "weight": 0.230805
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "cua_only-f001",
    "app": "dinoco_airlines",
    "instruction": "Open the Dinoco Airlines app, check in my next upcoming flightand confirm that the boarding pass screen actually renders. Download it locally as well.",
    "apps_involved": [
      "Dinoco Airlines"
    ],
    "category": "cua_only",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens Dinoco Airlines and locates the next upcoming (non-cancelled) flight by departure_date in the live booking list",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent clicks the 'Check in' control on that upcoming flight's row, not on a past or cancelled flight",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent verifies that the boarding pass view actually renders after check-in (flight number, route, and seat/gate details visible)",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent downloads the boarding pass locally (e.g., saves the file to disk or confirms a download action was triggered)",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent reports the boarding pass details (flight number and route) it observed on the rendered pass",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "aggregation-f001",
    "app": "dinoco_airlines",
    "instruction": "How many miles have I actually flown on Dinoco over the past 12 months? Add up the miles from every completed Dinoco flight in that window. Give me a spreadsheet of each trip, and the miles for each one with a final row of the cumulative miles. Also include the upcoming trips and their milage.",
    "apps_involved": [
      "Dinoco Airlines"
    ],
    "category": "aggregation",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent filters Dinoco flight history to flights that are completed (not cancelled and not still-upcoming) within the past 12 months and sums their miles from the live flight history",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent also lists upcoming Dinoco flights and their mileage alongside the completed-flight data, as explicitly requested",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent produces a per-trip table or spreadsheet-style breakdown with each trip and its miles, ending with a cumulative total row",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent reports the count of completed Dinoco flights in the past 12 months and the total miles for that window (even 1-2 flights is a valid result)",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent delivers a single answer covering completed-flight totals, upcoming-flight mileage, and the per-trip breakdown together",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "situated_action-f001",
    "app": "dinoco_airlines",
    "instruction": "Book me a one-way Dinoco flight from AVP to JFK in Economy for next Friday with my MasterCard. Make sure you get a seat on the left and a flight in the evening or night. Can you check that the we got a confirmation email for the flight and pull it up?",
    "apps_involved": [
      "Dinoco Airlines",
      "HooliMail"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent creates a new one-way Dinoco booking with origin AVP and destination JFK in the Economy fare class for next Friday",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent selects an evening or night departure flight (departure time in the evening or after sunset) and picks a seat on the left side of the aircraft",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent pays with Michael's MasterCard (not a different saved card or a new card entry)",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent completes the booking and captures the new Dinoco confirmation number returned by the app",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent confirms a booking confirmation email was received and pulls it up as requested by Michael",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent delivers a single end-to-end answer: new confirmation number plus confirmation that the booking email was found",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "retrieval-f001",
    "app": "dinoco_airlines",
    "instruction": "What's my current FlyMiles loyalty tier on Dinoco and how many miles do I have in the bank?",
    "apps_involved": [
      "Dinoco Airlines"
    ],
    "category": "retrieval",
    "difficulty": "easy",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads Michael's FlyMiles loyalty tier directly from the live Dinoco profile (not inferred from email or other apps)",
          "type": "llm_judge",
          "weight": 0.4
        },
        {
          "criterion": "Agent reads Michael's current FlyMiles miles balance directly from the live Dinoco profile",
          "type": "llm_judge",
          "weight": 0.4
        },
        {
          "criterion": "Agent reports the loyalty tier and miles balance together in a single answer",
          "type": "llm_judge",
          "weight": 0.2
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "hard_app-f001",
    "app": "dinoco_airlines",
    "instruction": "Cancel my next flight on Dinoco. After cancelling, verify we get an email for the cancellation, and pull it up. Let me know what you cancelled and how much credit we received.",
    "apps_involved": [
      "Dinoco Airlines",
      "HooliMail"
    ],
    "category": "hard_app",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent picks the next upcoming Dinoco booking that is still cancellable (not already past or cancelled) and cancels it through the Dinoco UI",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent confirms the cancellation confirmation email was received and pulls it up as requested",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent verifies a new travel-credit entry posts in Dinoco for the cancelled fare and reports the credit amount received",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent explicitly reports what flight was cancelled (route and/or flight number) and how much credit was received",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent delivers a single end-to-end answer: cancelled booking details, cancellation email confirmation, and travel-credit amount",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "aggregation-f019",
    "app": "etaxi",
    "instruction": "How much have I actually spent taking eTaxi to the Dunder Mifflin office this year? Just the commute rides. Sum up and report the total cost in a LibreOffice Calc sheet.",
    "apps_involved": [
      "eTaxi"
    ],
    "category": "aggregation",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent filters eTaxi to rides whose dropoff_address starts with 'Dunder Mifflin' (the unique-to-Office prefix that disambiguates the office from any home rides on the same street)",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent sums those rides to a YTD commute spend number",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent reports the Dunder Mifflin eTaxi commute ride count alongside the current-year YTD commute-only spend total",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent answers the how-much-spent question for Dunder Mifflin eTaxi commute rides with a single YTD dollar figure for the current year",
          "type": "llm_judge",
          "weight": 0.2
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "situated_action-f038",
    "app": "etaxi",
    "instruction": "Verify my saved work/office address on my ride app is set to Dunder Mifflin at 1725 Slough Ave (Suite 200, Scranton, PA). Report back what's currently saved \u2014 if it already matches, no action needed; if it's wrong or missing, fix it and save the change.",
    "apps_involved": [
      "eTaxi"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opened eTaxi and actually looked at the saved work/office location entry (whichever label the app uses \u2014 'Work' or 'Office')",
          "type": "llm_judge",
          "weight": 0.350133
        },
        {
          "criterion": "Agent reported the current saved work/office address to Michael, explicitly confirming whether it matches Dunder Mifflin at 1725 Slough Ave (with or without Suite 200 / Scranton, PA suffix)",
          "type": "llm_judge",
          "weight": 0.350133
        },
        {
          "criterion": "Agent explicitly states whether the saved work/office address already matches '1725 Slough Ave' (in which case no action is needed) or required a correction, and applies the update only if a mismatch was found",
          "type": "llm_judge",
          "weight": 0.099733
        },
        {
          "criterion": "If the agent applied any correction, it explicitly saves/commits the updated work/office location entry so the change persists (not just a temporary edit); if no correction was needed, the agent explicitly says so and does not modify the entry",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent navigates into the eTaxi saved locations screen and inspects the existing work/office entry (labeled 'Work' or 'Office' in the app) before deciding whether to update it to Dunder Mifflin 1725 Slough Ave",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "retrieval-f020",
    "app": "files",
    "instruction": "Where did I save my Threat Level Midnight sequel notes? Should be tucked somewhere under ~/Documents/Projects. Give me a quick rundown of what's in there (plot, cast, whatever I had going). Can you draft up a script that I can use as a starting point? Make sure you include everyone from work as a character, for casting reasons.",
    "apps_involved": [
      "Files"
    ],
    "category": "retrieval",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent locates the Threat_Level_Midnight_Sequel_Notes.txt file in ~/Documents/Projects and reports its absolute path",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent reads the file and provides a quick rundown of the plot, cast, and other elements from the sequel notes",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent produces a draft script grounded in the sequel notes (not a generic action script) that uses the notes as a starting point",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent's draft script includes multiple Dunder Mifflin coworkers as named characters (casting reason requirement)",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent opens the Files app and navigates to ~/Documents/Projects to locate the file before summarizing or drafting",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "retrieval-f032",
    "app": "files",
    "instruction": "I remember making a Dundies categorization document. Show me the Dundies 2026 categories I came up with. Where is the doc and what categories did I put down? Can you make a presentation with these categories using LibreOffice Impress so I can prepare for the Dundies? Message each of the candidates to make sure they show up to the Dundies.",
    "apps_involved": [
      "Files",
      "LibreOffice Impress",
      "HooliWork",
      "HooliMail"
    ],
    "category": "retrieval",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent locates Dundies_2026_Categories.txt in ~/Documents, reports its absolute path, and enumerates the Dundies 2026 categories from the document",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent creates a LibreOffice Impress presentation (.odp or .pptx) that includes the Dundies categories as slide content",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent sends messages (via HooliWork, HooliMail, or BuzzChat) to the Dundies candidates found in the document asking them to show up to the Dundies",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent opens the Dundies_2026_Categories.txt doc in the Files app and reads its content before producing the presentation or messages",
          "type": "llm_judge",
          "weight": 0.15
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "retrieval-f033",
    "app": "files",
    "instruction": "I'm working on my book, 'Somehow I Manage'. Where did I save Chapter 1, and can you give me a quick summary of what's in it? If you can write an extension for two more chapters in LibreOffice Writer, that would be great.",
    "apps_involved": [
      "Files",
      "LibreOffice Writer"
    ],
    "category": "retrieval",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent locates Somehow_I_Manage_Chapter_1.txt in ~/Documents and reports its absolute path",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent provides a faithful, coherent summary of Chapter 1's content in Michael's voice and tone",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent writes two additional chapters (Chapter 2 and Chapter 3, or equivalent) that continue the 'Somehow I Manage' book in Michael Scott's boss-lore style",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent opens or creates a LibreOffice Writer document (.odt or .docx) to write the two new chapter extensions",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent's chapter extensions maintain Michael's over-the-top, self-aggrandizing management-wisdom tone consistent with Chapter 1",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "situated_action-f029",
    "app": "files",
    "instruction": "Open my Threat Level Midnight sequel notes file and add five new plot points for me. Big ones. Michael Scarn's back. Send out the revised plot notes to a handful of my friends from work, who could be used for casting.",
    "apps_involved": [
      "Files",
      "HooliMail",
      "HooliWork"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent locates and opens the existing Threat Level Midnight sequel notes file at ~/Documents/Projects/Threat_Level_Midnight_Sequel_Notes.txt via the Files app before adding new plot points",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent adds at least 5 new plot points to the existing file and saves it so the additions persist on disk",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent writes the 5 new plot points in Michael Scarn's Threat Level Midnight sequel tone (over-the-top action voice, big dramatic beats)",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent sends the revised plot notes (or a summary/link) to at least a handful of Dunder Mifflin coworkers via HooliWork, HooliMail, or BuzzChat as potential casting candidates",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent addresses the outreach to coworkers explicitly as potential casting candidates for the Threat Level Midnight sequel",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "hard_app-f013",
    "app": "hangrydash",
    "instruction": "I want a salad-forward lunch today on HangryDash from a restaurant in my order history that fits my vibe (salad-forward, nothing too spicy). Stick to my usual lunch budget. Apply any promo codes that surface and tip the driver 20%.",
    "apps_involved": [
      "HangryDash"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent picks a salad-forward HangryDash restaurant from Michael's live order history (e.g. Sweetgreen or a comparable salad/bowl-focused restaurant Michael has ordered from before)",
          "type": "llm_judge",
          "weight": 0.149381
        },
        {
          "criterion": "Agent picks a lunch within Michael's usual HangryDash lunch spending range that matches his salad/bowl dietary pattern",
          "type": "llm_judge",
          "weight": 0.124425
        },
        {
          "criterion": "Agent attempts to apply a HangryDash promo code at checkout for the salad-forward lunch order, or notes that no promo code is available if none surface",
          "type": "llm_judge",
          "weight": 0.107788
        },
        {
          "criterion": "Agent enters a 20% driver tip on the salad-forward HangryDash lunch order at checkout",
          "type": "llm_judge",
          "weight": 0.107788
        },
        {
          "criterion": "Agent places the order and sees it in history",
          "type": "llm_judge",
          "weight": 0.224
        },
        {
          "criterion": "Agent reviews HangryDash order history to confirm the chosen restaurant is salad-forward and one Michael has ordered from before",
          "type": "llm_judge",
          "weight": 0.074336
        },
        {
          "criterion": "Agent reviews HangryDash order history highlights to identify a salad-forward restaurant Michael has ordered from before picking the lunch order target",
          "type": "llm_judge",
          "weight": 0.106195
        },
        {
          "criterion": "Agent adds the picked salad-forward lunch item (priced within Michael's usual HangryDash budget) from the chosen salad-forward restaurant into the cart before proceeding to checkout",
          "type": "llm_judge",
          "weight": 0.106195
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f014",
    "app": "hangrydash",
    "instruction": "Rerun my last Cooper's Seafood House order from HangryDash with the same items I got last time. Apply any promo code that shows up and tip 20%. In the order notes, ask the kitchen to send extra napkins and crackers on the side, and to make sure anything hot is sent piping hot.",
    "apps_involved": [
      "HangryDash"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens HangryDash and navigates to the order history screen to surface the prior Cooper's Seafood House order list",
          "type": "llm_judge",
          "weight": 0.108268
        },
        {
          "criterion": "Agent locates your most recent Cooper's Seafood House order in history",
          "type": "llm_judge",
          "weight": 0.108268
        },
        {
          "criterion": "Agent clicks the Reorder button on the prior Cooper's Seafood House entry or opens a fresh HangryDash Cooper's Seafood House order seeded with the same line items",
          "type": "llm_judge",
          "weight": 0.108268
        },
        {
          "criterion": "Agent adds the exact same item(s) from Michael's most recent Cooper's Seafood House order (as shown in his HangryDash order history) into the new cart",
          "type": "llm_judge",
          "weight": 0.137795
        },
        {
          "criterion": "Agent adds an order note requesting extra napkins and crackers on the side and that hot items be sent piping hot",
          "type": "llm_judge",
          "weight": 0.139518
        },
        {
          "criterion": "Agent attempts to apply a HangryDash promo code at checkout for the Cooper's Seafood House reorder, or notes that no promo code is available if none surface",
          "type": "llm_judge",
          "weight": 0.113681
        },
        {
          "criterion": "Agent enters a 20% driver tip on the Cooper's Seafood House HangryDash reorder before submitting",
          "type": "llm_judge",
          "weight": 0.113681
        },
        {
          "criterion": "Agent places the order and verifies it in history",
          "type": "llm_judge",
          "weight": 0.170522
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "preference_inference-f025",
    "app": "hangrydash",
    "instruction": "What do I usually tip on food delivery, in dollars and as a percent? I want to set a smart default so I'm not thinking about it every order. Give me a report of all my orders and their tips in a LibreOffice spreadsheet.",
    "apps_involved": [
      "HangryDash",
      "LibreOffice Calc"
    ],
    "category": "preference_inference",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent queries tip_amount and subtotal at the order-level for the HangryDash food-delivery tip analysis, covering Michael's full order history",
          "type": "llm_judge",
          "weight": 0.21
        },
        {
          "criterion": "Agent computes mean tip in both $ and % terms",
          "type": "llm_judge",
          "weight": 0.21
        },
        {
          "criterion": "Agent identifies the modal tip percent (15%, 18%, 20%, or custom) from the HangryDash order history",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent recommends a defensible pre-fill default tip value grounded in the observed HangryDash tip pattern",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent creates a LibreOffice Calc spreadsheet (.ods or .xlsx) containing a per-order row report with at minimum the order identifier, tip amount, and subtotal columns for Michael's HangryDash order history",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent cites the HangryDash order-history tip rows as the data source for the recommended smart-default tip value, not inventing numbers",
          "type": "llm_judge",
          "weight": 0.07
        },
        {
          "criterion": "Agent delivers the full food-delivery tip answer in dollars and percent plus the smart-default recommendation and the LibreOffice spreadsheet",
          "type": "llm_judge",
          "weight": 0.07
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "pattern_inference"
  },
  {
    "id": "situated_action-f009",
    "app": "hangrydash",
    "instruction": "Order me my usual from Chili's \u2014 pick whichever item I've ordered most often from there, or just match my most recent Chili's order if multiple items are tied. Deliver to my place.",
    "apps_involved": [
      "HangryDash"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent identifies Chili's as the target restaurant and opens HangryDash order history to find Michael's prior Chili's orders",
          "type": "llm_judge",
          "weight": 0.208333
        },
        {
          "criterion": "Agent places a Chili's order whose item(s) match Michael's most-frequent/modal item in his HangryDash Chili's order history (if two items tie for modal, either tied item is acceptable; matching the most-recent Chili's order is also acceptable as a tiebreaker)",
          "type": "llm_judge",
          "weight": 0.333334
        },
        {
          "criterion": "Agent selects a delivery address on the order that matches Michael's saved home/primary HangryDash address",
          "type": "llm_judge",
          "weight": 0.208333
        },
        {
          "criterion": "Agent adds the usual Chili's item(s) to the HangryDash cart and submits the order through the checkout flow so the order is placed",
          "type": "llm_judge",
          "weight": 0.25
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f033",
    "app": "hoolicalendar",
    "instruction": "Check my Thursday for any improv conflicts. If there's a Thursday improv class window stomping on a work meeting (and I cannot skip improv), find a different Thursday slot for the work meeting that doesn't touch the improv window, and add fifteen-minute buffers on both sides so I can transition without looking frazzled. I think the meeting has Andy and Jim, message them on HooliWork and send them updated invites on Hoolicalendar if needed. If no Thursday meeting actually overlaps an improv block, just confirm the all-clear and move on.",
    "apps_involved": [
      "HooliCalendar",
      "HooliWork"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens HooliCalendar and navigates to the Thursday day view to surface the scheduling conflict",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent identifies any improv class event on the upcoming Thursday from live HooliCalendar (Scranton Improv Workshop, Improv Class, or any improv-tagged event) as the unmovable anchor",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent checks the Thursday work meetings against the improv class window and either identifies an overlapping meeting or explicitly reports that no Thursday meeting conflicts with the improv block",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "If a conflict was found, agent picks a non-conflicting Thursday time slot for that meeting that does not touch the improv window (or states that no move is needed if no conflict was found)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "If a conflict was found, agent moves the conflicting Thursday meeting in HooliCalendar to the new non-conflicting slot (or notes that no meeting needed to be moved)",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "If a meeting was moved, agent protects a 15-minute transition buffer immediately before the rescheduled Thursday meeting (either by creating a buffer event or by scheduling the meeting so at least 15 minutes of empty time precedes it)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "If a meeting was moved, agent protects a 15-minute transition buffer immediately after the rescheduled Thursday meeting (either by creating a buffer event or by scheduling the meeting so at least 15 minutes of empty time follows it)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "If a meeting was moved, agent sends updated HooliCalendar invites to the meeting attendees (Andy and Jim) after the move \u2014 OR, if the moved meeting has no attendees listed, agent explicitly notes that no attendees existed so no invites were needed",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "If a meeting was moved, agent messages Andy and Jim on HooliWork to notify them of the schedule change \u2014 OR notes that no HooliWork message was needed because no conflict existed or no move occurred",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent verifies the Thursday view in HooliCalendar \u2014 confirming either that the moved meeting and improv class show no overlap, or that no conflict existed to resolve",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "hard_app-f034",
    "app": "hoolicalendar",
    "instruction": "Build me a full Dundies prep schedule on my calendar. I want five events spread across the next two weeks: a categories brainstorm, a venue walkthrough, MC rehearsal, catering confirmation, and a dress rehearsal. Color-code them so they pop, invite Pam, Dwight, Kevin, Jim, and Holly on each, and if the event editor lets you set reminders, add 24-hour and 1-hour reminders as well.",
    "apps_involved": [
      "HooliCalendar"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens HooliCalendar and switches to a 2-week view to plan the Dundies prep schedule",
          "type": "llm_judge",
          "weight": 0.081967
        },
        {
          "criterion": "Agent creates the 'Dundies categories brainstorm' event in HooliCalendar early in week 1 of the prep schedule",
          "type": "llm_judge",
          "weight": 0.098361
        },
        {
          "criterion": "Agent creates the 'Venue walkthrough' event in HooliCalendar mid-week 1 for the Dundies prep schedule",
          "type": "llm_judge",
          "weight": 0.098361
        },
        {
          "criterion": "Agent creates the 'MC rehearsal' practice event late in week 1 of the HooliCalendar Dundies plan to lock in stage time",
          "type": "llm_judge",
          "weight": 0.098361
        },
        {
          "criterion": "Agent creates the 'Catering confirmation' event in HooliCalendar early in week 2 for the Dundies prep schedule",
          "type": "llm_judge",
          "weight": 0.098361
        },
        {
          "criterion": "Agent books the 'Dress rehearsal' final run-through near week 2's end on HooliCalendar so the Dundies cast walks the show before opening night",
          "type": "llm_judge",
          "weight": 0.098361
        },
        {
          "criterion": "Agent selects a single shared Dundies color on each of the 5 prep events so they pop in the 2-week calendar view",
          "type": "llm_judge",
          "weight": 0.098361
        },
        {
          "criterion": "Agent adds Pam, Dwight, Kevin, Jim, and Holly as attendees on each of the 5 Dundies prep events, or notes clearly that HooliCalendar's event editor does not expose an attendees picker and records the intended invitee list in the event body / description so Michael can follow up",
          "type": "llm_judge",
          "weight": 0.114754
        },
        {
          "criterion": "Agent attempts to add 24-hour and 1-hour reminders on each of the 5 Dundies prep events if the HooliCalendar event editor exposes per-event reminders",
          "type": "llm_judge",
          "weight": 0.114754
        },
        {
          "criterion": "Agent verifies all 5 Dundies prep events appear in the HooliCalendar 2-week view with the shared color and the five attendees",
          "type": "llm_judge",
          "weight": 0.098361
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "preference_inference-f023",
    "app": "hoolicalendar",
    "instruction": "Is my week front-loaded in the morning or packed in the evening? Pull up my calendar for this week and count events AM vs PM and give it to me in a LibreOffice Writer note.",
    "apps_involved": [
      "HooliCalendar",
      "LibreOffice Writer"
    ],
    "category": "preference_inference",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads HooliCalendar visually (cua) for the current week to gather the AM vs PM event data",
          "type": "llm_judge",
          "weight": 0.22
        },
        {
          "criterion": "Agent counts AM vs PM events per day with numeric totals drawn from the live HooliCalendar week view",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent computes and reports the week's AM:PM event ratio as a concrete numeric comparison",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent calls out any conspicuous recurring morning or evening blocks on Michael's HooliCalendar week (e.g., a daily morning standup, a recurring evening prep block)",
          "type": "llm_judge",
          "weight": 0.06
        },
        {
          "criterion": "Agent states an explicit morning-heavy vs evening-heavy verdict on Michael's calendar for the current week",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent saves the AM-vs-PM calendar analysis (per-day counts and final verdict) in a LibreOffice Writer document (.odt or .docx), as explicitly requested",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent reports AM vs PM event counts that reflect what is actually visible in the HooliCalendar week view rather than invented totals",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent finishes the AM-vs-PM calendar analysis and delivers both the per-day counts and the final morning-vs-evening verdict in the LibreOffice Writer note",
          "type": "llm_judge",
          "weight": 0.04
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "pattern_inference"
  },
  {
    "id": "hard_app-f007",
    "app": "hoolichat",
    "instruction": "My HooliChat is a mess and I want you to do some light housekeeping. In the Party Planning Committee, Dundie Awards Planning, and Finer Things Club groups, post a short recap message in each one summarizing whatever conversation exists \u2014 call out the three most useful messages if there are enough to pick from, otherwise just summarize the entire conversation in that group. Then in the Party Planning Committee group, send a status update reminder calling for any new agenda items.",
    "apps_involved": [
      "HooliChat"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent posts a recap message in HooliChat Party Planning Committee that summarizes the conversation (quoting or summarizing the 3 most useful recent messages if there are enough, otherwise summarizing the entire conversation)",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent posts a recap message in HooliChat Dundie Awards Planning that summarizes the conversation (quoting or summarizing the 3 most useful recent messages if there are enough, otherwise summarizing the entire conversation)",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent posts a recap message in HooliChat Finer Things Club that summarizes the conversation (quoting or summarizing the 3 most useful recent messages if there are enough, otherwise summarizing the entire conversation)",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent posts a separate status-update reminder in Party Planning Committee asking for any new agenda items",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent's recap messages persist in their respective HooliChat groups after refresh (4 new messages total visible in chat history)",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent opens HooliChat and enters each of the three groups (Party Planning Committee, Dundie Awards Planning, Finer Things Club) before composing the recap messages",
          "type": "llm_judge",
          "weight": 0.14
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "preference_inference-f021",
    "app": "hoolichat",
    "instruction": "When someone DMs me on HooliChat, how fast do I actually write back? Bucket my replies (under 5 minutes, under an hour, under a day, longer) and tell me if I'm a fast responder or a ghoster. Return the distribution of my responses in a LibreOffice Writer Chart.",
    "apps_involved": [
      "HooliChat"
    ],
    "category": "preference_inference",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent pulls paired timestamps of incoming DMs and Michael's replies from HooliChat DMs to compute how fast he writes back",
          "type": "llm_judge",
          "weight": 0.263253
        },
        {
          "criterion": "Agent groups the reply latencies into the <5min, <1h, <1d, and >1d buckets requested in the write-back prompt and reports a count per bucket",
          "type": "llm_judge",
          "weight": 0.263253
        },
        {
          "criterion": "Agent calls out the mode bucket and characterizes Michael's response speed",
          "type": "llm_judge",
          "weight": 0.210642
        },
        {
          "criterion": "Agent states a fast-responder vs ghoster verdict in plain English based on which reply-latency bucket dominates",
          "type": "llm_judge",
          "weight": 0.105221
        },
        {
          "criterion": "Agent derives the write-back latency buckets from HooliChat DM timestamps and shows the per-bucket reply counts instead of stating the fast-responder verdict with no supporting data",
          "type": "llm_judge",
          "weight": 0.105221
        },
        {
          "criterion": "Agent finishes the fast-vs-ghoster DM analysis and reports both the bucket counts and the final responder verdict",
          "type": "llm_judge",
          "weight": 0.05241
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "pattern_inference"
  },
  {
    "id": "situated_action-f026",
    "app": "hoolichat",
    "instruction": "DM Jim and propose a near-future evening for the next Finer Things Club meetup. Pick something this week or next. He'd be a great addition; he has taste.",
    "apps_involved": [
      "HooliChat"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens or starts a HooliChat DM thread with Jim Halpert before composing the Finer Things Club invite",
          "type": "llm_judge",
          "weight": 0.26
        },
        {
          "criterion": "Agent picks a reasonable near-future evening (this week or next) as the proposed Finer Things Club meetup time",
          "type": "llm_judge",
          "weight": 0.26
        },
        {
          "criterion": "Agent sends the DM to Jim Halpert as a one-on-one HooliChat message rather than posting it in a group channel",
          "type": "llm_judge",
          "weight": 0.24
        },
        {
          "criterion": "Agent writes a HooliChat DM body that proposes the chosen near-future evening to Jim and references the Finer Things Club meetup",
          "type": "llm_judge",
          "weight": 0.24
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "aggregation-f037",
    "app": "hoolimail",
    "instruction": "Triage my inbox. It's overflowing, so give me a per-sender count and the top 10 senders filling it up.",
    "apps_involved": [
      "HooliMail"
    ],
    "category": "aggregation",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads Michael's HooliMail inbox messages \u2014 via the HooliMail app/API listing the Inbox folder including any sub-folders (Work, Travel, Receipts, etc.), OR by walking the Maildir filesystem at ~/Maildir/cur, ~/Maildir/new, and sub-folder cur/new dirs \u2014 but excludes the Sent folder and excludes Michael's own From: lines when counting senders",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent groups the inbox messages by sender email and counts how many messages each sender is filling it up with",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent reports the inbox senders as a top-10 ranked list with per-sender counts",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent presents the top-10 inbox senders as a readable ranked list with one sender per line and a visible count column",
          "type": "llm_judge",
          "weight": 0.125
        },
        {
          "criterion": "Agent lists approximately 10 inbox sender entries in the top-senders ranking (not just 5 or 7)",
          "type": "llm_judge",
          "weight": 0.125
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "cua_only-f015",
    "app": "hoolimail",
    "instruction": "Compose a quick new email to Pam in HooliMail with my signature ('World's Best Boss, Somehow I Manage') typed at the bottom of the body. Save it as a draft so I can review and send later.",
    "apps_involved": [
      "HooliMail"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens HooliMail and clicks Write to open a new compose window addressed to pam.beesly@dundermifflin.com",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent fills a non-empty subject line (e.g. 'Quick note') in the HooliMail compose window",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent writes a short message body in the new HooliMail draft to Pam",
          "type": "llm_judge",
          "weight": 0.17
        },
        {
          "criterion": "Agent appends the 'World's Best Boss, Somehow I Manage' signature line at the end of the email body before saving",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent saves the email to Pam as a draft in HooliMail (does not hit Send) so Michael can review later",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent's draft persists in the HooliMail Drafts folder after saving and the signature line is visible in the saved draft body",
          "type": "llm_judge",
          "weight": 0.05
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "preference_inference-f010",
    "app": "hoolimail",
    "instruction": "Help me auto-triage my inbox. Which senders do I usually reply to fast, and which ones do I let sit for days? I want to build a priority filter around that pattern.",
    "apps_involved": [
      "HooliMail"
    ],
    "category": "preference_inference",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent locates Michael's sent emails to build the auto-triage data set \u2014 either by walking the Maildir filesystem (.Sent/cur or .Sent/new with 'From: michael.scott' grep) or by querying the HooliMail Sent folder via /api/emails?folder=sent",
          "type": "llm_judge",
          "weight": 0.2106
        },
        {
          "criterion": "Agent computes a median reply latency per top sender from the paired received/sent timestamps and surfaces which senders Michael lets sit for days",
          "type": "llm_judge",
          "weight": 0.2106
        },
        {
          "criterion": "Agent identifies the fastest-reply and slowest-reply senders with supporting evidence",
          "type": "llm_judge",
          "weight": 0.2106
        },
        {
          "criterion": "Agent proposes a concrete auto-triage filter rule based on the data",
          "type": "llm_judge",
          "weight": 0.1578
        },
        {
          "criterion": "Agent derives the fast-vs-days reply pattern from Maildir timestamps rather than guessing, and names Michael as the sender of record",
          "type": "llm_judge",
          "weight": 0.1052
        },
        {
          "criterion": "Agent finishes the auto-triage build: the recommended priority filter rule is stated and references the fastest and slowest senders",
          "type": "llm_judge",
          "weight": 0.1052
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "pattern_inference"
  },
  {
    "id": "situated_action-f027",
    "app": "hoolimail",
    "instruction": "Start a draft email to David Wallace with my most recent quarter's sales summary. Pull a ballpark from my ~/Downloads/DunderMifflin_Q2_Sales_Forecast.txt so the numbers actually have a basis. Save it as a draft; do NOT hit send. I want to review it first. Do some analysis in LibreOffice Calc so I have some empirical analysis to send him as well.",
    "apps_involved": [
      "HooliMail",
      "Files",
      "LibreOffice Calc"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads ~/Downloads/DunderMifflin_Q2_Sales_Forecast.txt to extract the sales figures that ground the email body",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent performs sales analysis in LibreOffice Calc using figures from the forecast file, producing a spreadsheet with empirical analysis (totals, trends, or comparisons) as an attachment or saved file for Michael to send along",
          "type": "llm_judge",
          "weight": 0.22
        },
        {
          "criterion": "Agent saves the most-recent-quarter sales summary email to David Wallace as a HooliMail draft and does not send it, so Michael can review it first",
          "type": "llm_judge",
          "weight": 0.23
        },
        {
          "criterion": "Draft is addressed to david.wallace@dundermifflin.com and the body references the most recent quarter's sales grounded in figures from the forecast file",
          "type": "llm_judge",
          "weight": 0.22
        },
        {
          "criterion": "Agent explicitly notes that it did not hit Send, leaving the HooliMail draft for Michael to review before sending",
          "type": "llm_judge",
          "weight": 0.18
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "hard_app-f019",
    "app": "hoolishop",
    "instruction": "Remember my most recent World's Best Boss Mug order from HooliShop? Pull up that order and start a return on it \u2014 it came out a little off. While you're at it, reorder a replacement (prefer a Prime-eligible listing so it ships quickly), and leave a five-star review on the product page praising the vendor anyway because I love them.",
    "apps_involved": [
      "HooliShop"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens HooliShop and navigates to the order history view",
          "type": "llm_judge",
          "weight": 0.07
        },
        {
          "criterion": "Agent locates the most-recent World's Best Boss Mug order (any variant) in HooliShop order history",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent submits a return request on 1 unit of the most-recent HooliShop World's Best Boss Mug order (any variant) via the order-actions Return button",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent confirms the HooliShop return submission and the order's status reflects the return action",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent reorders 1 World's Best Boss Mug on HooliShop (using the same prior product if available, or the closest equivalent in the catalog if the original SKU is no longer listed)",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent reorders a Prime-eligible World's Best Boss Mug listing (HooliShop shows a PrimeBadge on fast-shipping items; picking a Prime-eligible product satisfies 'fastest shipping' since HooliShop has no separate speed selector)",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent places the HooliShop reorder so the replacement World's Best Boss Mug enters order history",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent posts a 5-star HooliShop product review with a short praising note on the World's Best Boss Mug product page (reviews on HooliShop are per-product via /api/products/[id]/reviews, not per-order)",
          "type": "llm_judge",
          "weight": 0.15
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f020",
    "app": "hoolishop",
    "instruction": "I want to give every member of the team a personalized World's Best Boss mug as a Dundies thank-you. Place a bulk order on HooliShop for about two dozen World's Best Boss mugs (look for the variant I've bought before) and apply any subscribe-and-save or bulk discount before checkout. Ship them to the office. Don't buy anything else.",
    "apps_involved": [
      "HooliShop"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent searches HooliShop and finds the World's Best Boss mug listing matching the variant Michael has bought before",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent enters a bulk HooliShop cart quantity of about 24 World's Best Boss mugs for the whole team",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent checks the unit price and the HooliShop cart total before proceeding to the bulk-discount step",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent applies a subscribe-save / bulk discount if surfaced at HooliShop checkout",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent updates the HooliShop ship-to to the Dunder Mifflin office at 1725 Slough Ave so the bulk mug order ships to the office (entering a custom office ship-to at checkout is acceptable since HooliShop's saved addresses may not include a pre-labeled Work entry)",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent places the HooliShop bulk World's Best Boss mug order and confirms it appears in Michael's order history",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent's cart at checkout contains only World's Best Boss mugs (about 24 units) and no other products \u2014 the instruction explicitly says 'Don't buy anything else'",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent reviews the final HooliShop bulk mug cart total at the checkout screen before submitting the order to the office address",
          "type": "llm_judge",
          "weight": 0.11
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f039",
    "app": "hoolishop",
    "instruction": "I want another copy of my own book \u2014 Somehow I Manage. Order me one, same as my last order of that book, and ship it home.",
    "apps_involved": [
      "HooliShop"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens HooliShop and searches for the 'Somehow I Manage' book listing (Michael's self-published leadership book) to surface the matching product page",
          "type": "llm_judge",
          "weight": 0.26
        },
        {
          "criterion": "Agent opens HooliShop order history and confirms a prior 'Somehow I Manage' book order (specifically the self-published Leadership Book, not the binding kit variant) before reordering \u2014 the most-recent leadership-book order suffices",
          "type": "llm_judge",
          "weight": 0.26
        },
        {
          "criterion": "Agent places a HooliShop order that contains exactly one 'Somehow I Manage' book matching the variant Michael previously bought",
          "type": "llm_judge",
          "weight": 0.26
        },
        {
          "criterion": "Agent selects Michael's saved home/primary address on HooliShop as the shipping address for the replacement book (the address labeled 'home' in his saved addresses)",
          "type": "llm_judge",
          "weight": 0.22
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f010",
    "app": "hooliwork",
    "instruction": "Turn on my HooliWork notifications so I don't miss anything important. For the sales, general, random, and party-planning-committee channels, make sure each channel's notification preference is set to 'all messages' (flip anything that isn't already there). Bookmark ten important messages across those channels while you're in there, and start a new thread in random about Pretzel Day to build hype.",
    "apps_involved": [
      "HooliWork"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent ensures the HooliWork #sales channel notification_pref is set to 'all' via the channel preferences panel (flipping it if it is not already 'all', or verifying + keeping it at 'all' if it already is)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent ensures the HooliWork #general channel notification_pref is set to 'all' via the channel preferences panel (flipping it if it is not already 'all', or verifying + keeping it at 'all' if it already is)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent ensures the HooliWork #random channel notification_pref is set to 'all' via the channel preferences panel (flipping it if it is not already 'all', or verifying + keeping it at 'all' if it already is)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent ensures the HooliWork #party-planning-committee channel notification_pref is set to 'all' via the channel preferences panel (flipping it if it is not already 'all', or verifying + keeping it at 'all' if it already is)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent saves at least 10 important HooliWork messages as bookmarks spread across the #sales, #general, #random, and #party-planning-committee channels",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent creates a new HooliWork thread in #random titled or about Pretzel Day",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent posts an on-topic opening message in the new Pretzel Day thread in #random that mentions Pretzel Day rather than placeholder text",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent verifies the notification-pref changes on the 4 channels, the 10 bookmarks, and the Pretzel Day thread all persist after a refresh",
          "type": "llm_judge",
          "weight": 0.15
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f004",
    "app": "hooliwork",
    "instruction": "Post a Motivational Monday message in our #random channel at work. You know how I like it: World's Best Boss style, kick the team's week off right. Pull a quote or two from my Motivational Quotes Collection in ~/Documents/ as a starting point so it doesn't feel made up.",
    "apps_involved": [
      "HooliWork",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "easy",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent posts to the HooliWork #random channel (not another channel)",
          "type": "llm_judge",
          "weight": 0.304287
        },
        {
          "criterion": "Agent writes the #random Monday post in Michael Scott's World's Best Boss motivational voice to kick the team's week off",
          "type": "llm_judge",
          "weight": 0.304287
        },
        {
          "criterion": "Message explicitly frames the post as a Monday motivation (week kickoff, start the week strong, Motivational Monday, etc.)",
          "type": "llm_judge",
          "weight": 0.130409
        },
        {
          "criterion": "Agent writes the #random Motivational Monday post as over-the-top self-aggrandizing World's Best Boss Michael Scott energy, not corporate HR boilerplate",
          "type": "llm_judge",
          "weight": 0.130583
        },
        {
          "criterion": "Agent reads ~/Documents/Motivational_Quotes_Collection.txt and pulls at least one quote into the Motivational Monday post (or directly cites the file as the source)",
          "type": "llm_judge",
          "weight": 0.130435
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f036",
    "app": "hooliwork",
    "instruction": "Post a Pretzel Day countdown for next Monday at 8:00 AM EST in the #party-planning-committee channel at work. Build the hype.",
    "apps_involved": [
      "HooliWork"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent navigates to the HooliWork #party-planning-committee channel before composing the Pretzel Day countdown message",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent composes a Pretzel Day countdown message body that mentions Pretzel Day and includes hype language (days remaining, excitement, etc.)",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent attempts to schedule or post the Pretzel Day countdown message for next Monday at 8:00 AM EST (scheduled delivery or immediate post with the specified time noted)",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent posts or schedules the Pretzel Day countdown message in the HooliWork #party-planning-committee channel rather than another channel",
          "type": "llm_judge",
          "weight": 0.25
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f017",
    "app": "kwik_e_mart",
    "instruction": "Reorder my last run from a Kwik-E-Mart store I've used recently \u2014 pick a store I've ordered from most often OR most recently in my order history. If anything's out of stock, swap in reasonable substitutes (otherwise just keep what's available). Do a quick price check against another store in the app so I know I'm not getting fleeced, and schedule delivery for before end of day today.",
    "apps_involved": [
      "Kwik-E-Mart"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens Kwik-E-Mart and navigates to the order history view to surface a prior run at one of Michael's top-frequency stores or his most-recent store",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent picks a reorder anchor at one of Michael's top-frequency Kwik-E-Mart stores by order count (any store tied at the top by order count is acceptable) OR his most-recent Kwik-E-Mart order, and bases the reorder on that order's line items",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent starts a reorder and handles out-of-stock items appropriately \u2014 picks substitutions if any items are flagged OOS, or notes that all original items were available and proceeds without substitutions",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent opens a second store and compares prices",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent completes the price comparison and proceeds with the cheaper option (returns to the original store's cart if it is cheapest, or switches to the cheaper store)",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent selects the earliest available delivery slot at Kwik-E-Mart checkout for the chosen-store reorder",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent places the Kwik-E-Mart reorder and verifies it appears in the recent-orders history after checkout",
          "type": "llm_judge",
          "weight": 0.15
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f018",
    "app": "kwik_e_mart",
    "instruction": "Build me a Kwik-E-Mart shopping spree for Dundies catering for about a hundred people: drinks, chips, plates, plastic silverware, napkins, plenty of ice, the works. Use Wegman's. Check out with delivery to my home address and make sure the delivery window is at least a two-hour slot so I'm not stuck waiting by the door all afternoon.",
    "apps_involved": [
      "Kwik-E-Mart"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens Kwik-E-Mart, selects Wegman's as the store for the Dundies catering bulk order (instruction explicitly specifies Wegman's), and views any existing shopping lists before building the new catering list",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent creates or extends a Dundies Catering shopping list (or a 'Party Supplies' list) in Kwik-E-Mart for the catering items",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent adds 6+ Kwik-E-Mart shopping list categories (drinks, chips, plates, plastic silverware, napkins, ice) with quantities scaled for 100 people",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent moves the Dundies Catering Kwik-E-Mart shopping list contents into the cart",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent confirms delivery address is 1725 Slough Ave (Michael's home address) at Kwik-E-Mart checkout",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent picks a delivery window at least 2 hours wide at Kwik-E-Mart checkout so Michael is not waiting all afternoon",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent places the Kwik-E-Mart Dundies catering order and confirms the order total and delivery window persist in order history",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent verifies the Dundies Catering shopping list persists and displays in Kwik-E-Mart shopping_lists after checkout",
          "type": "llm_judge",
          "weight": 0.09
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "preference_inference-f009",
    "app": "kwik_e_mart",
    "instruction": "What's my favorite grocery store, and what do I actually buy there? Rank my stores and give me a quick profile of my shopping habits in LibreOffice Writer. ",
    "apps_involved": [
      "Kwik-E-Mart"
    ],
    "category": "preference_inference",
    "difficulty": "easy",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent identifies Michael's favorite Kwik-E-Mart grocery store as the top-by-order-count store from live order history (/api/orders), naming whichever store has the plurality of orders",
          "type": "llm_judge",
          "weight": 0.2632
        },
        {
          "criterion": "Agent reports the live order counts per store from Michael's Kwik-E-Mart order history (/api/orders), confirming the top-by-order-count store is ranked first ahead of the other stores Michael has used",
          "type": "llm_judge",
          "weight": 0.2106
        },
        {
          "criterion": "Agent characterizes the typical line items Michael buys there",
          "type": "llm_judge",
          "weight": 0.2106
        },
        {
          "criterion": "Agent writes a one-paragraph profile of Michael's Kwik-E-Mart grocery shopping habits at his favorite store",
          "type": "llm_judge",
          "weight": 0.1578
        },
        {
          "criterion": "Agent reads Kwik-E-Mart order-history line items and states them as the data source backing the favorite grocery store ranking and buying profile",
          "type": "llm_judge",
          "weight": 0.0842
        },
        {
          "criterion": "Agent delivers the favorite Kwik-E-Mart grocery store ranking and the buying-habits profile before ending the task",
          "type": "llm_judge",
          "weight": 0.0736
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "pattern_inference"
  },
  {
    "id": "situated_action-f018",
    "app": "kwik_e_mart",
    "instruction": "I need groceries from Price Chopper this week (changing it up from Wegmans). Reorder whatever I got the most recent time I shopped at Price Chopper on Kwik-E-Mart and have it delivered to home. Tip 20%, I feel bad for not tipping last time. Make sure we add oat milk and eggs if not in the order, need those!",
    "apps_involved": [
      "Kwik-E-Mart"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens Kwik-E-Mart and locates Michael's most-recent Price Chopper order in the order history (filtering by store and picking the most recent Price Chopper basket as the anchor)",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent copies the line items from that Price Chopper order into the new reorder cart at Price Chopper (not Wegmans or another store)",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent adds oat milk to the cart if it was not already in the Price Chopper reorder, and adds eggs if they were not already in the Price Chopper reorder, per Michael's explicit request",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent sets a 20% tip on the Kwik-E-Mart Price Chopper order at checkout",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent selects Michael's home address (1725 Slough Ave, Apt 4B) as the Kwik-E-Mart delivery destination and submits the Price Chopper reorder checkout",
          "type": "llm_judge",
          "weight": 0.2
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "cua_only-f011",
    "app": "libreoffice",
    "instruction": "Open my Dundies 2026 categories doc in Writer, bold the 'Whitest Sneakers' line, then add a new category called 'Fine Work Outfit' right after it. Save it when you're done.",
    "apps_involved": [
      "LibreOffice Writer"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens LibreOffice Writer before loading the Dundies 2026 categories document for the bold and append edits",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent opens the Dundies_2026_Categories.txt file in LibreOffice Writer (not a terminal editor)",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent finds the 'Whitest Sneakers' line in the Dundies 2026 categories doc and selects it before applying formatting",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent applies bold formatting to the 'Whitest Sneakers' line during the editing session via Ctrl+B or the toolbar bold button (visible in screenshots; bold attribute will not persist when the .txt source file is saved back as plain text \u2014 accept screenshot evidence of the formatting being applied at any point in the session)",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent inserts a new 'Fine Work Outfit' category line immediately after the 'Whitest Sneakers' line in the Dundies 2026 categories doc",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent saves the edited Dundies_2026_Categories.txt file back to the same ~/Documents path in Writer, keeping the original text format if prompted",
          "type": "llm_judge",
          "weight": 0.166667
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "cua_only-f021",
    "app": "libreoffice",
    "instruction": "Write me a one-page Dundies 2026 invitation in Writer. I want a big bold title at the top, a bulleted list of this year's categories pulled from my Dundies categories doc, and a signature block that says 'Michael Scott, World's Best Boss'. Save it to my desktop so I can print a stack.",
    "apps_involved": [
      "LibreOffice Writer"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads ~/Documents/Dundies_2026_Categories.txt to retrieve this year's Dundies category names for the invitation",
          "type": "llm_judge",
          "weight": 0.132075
        },
        {
          "criterion": "Agent opens LibreOffice Writer and creates a new document for the Dundies 2026 invitation",
          "type": "llm_judge",
          "weight": 0.132075
        },
        {
          "criterion": "Agent writes a big bold centered top-of-page title reading 'THE DUNDIES 2026' in the Writer invitation document",
          "type": "llm_judge",
          "weight": 0.150943
        },
        {
          "criterion": "Agent writes a short Dundies 2026 invitation intro paragraph below the title in the Writer document",
          "type": "llm_judge",
          "weight": 0.132075
        },
        {
          "criterion": "Agent inserts a bulleted list of this year's Dundies categories pulled from Dundies_2026_Categories.txt into the Writer invitation document",
          "type": "llm_judge",
          "weight": 0.150943
        },
        {
          "criterion": "Agent adds a signature block reading 'Michael Scott, World's Best Boss' at the bottom of the Writer invitation document",
          "type": "llm_judge",
          "weight": 0.150943
        },
        {
          "criterion": "Agent saves the Dundies 2026 invitation as ~/Desktop/dundies_invite.odt fitting on a single page",
          "type": "llm_judge",
          "weight": 0.150943
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "cua_only-f024",
    "app": "libreoffice",
    "instruction": "Throw together a three-slide Impress deck for my Jamaica trip: one slide for the flight info, one for the Sandals hotel info, and one with a rough daily schedule. Save it into my Trips documents folder.",
    "apps_involved": [
      "LibreOffice Impress",
      "Dinoco Airlines",
      "Cheskepdia",
      "HooliMail"
    ],
    "category": "long_horizon",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent creates an Impress deck for Michael's Jamaica trip that contains exactly 3 slides (flight, hotel, daily schedule)",
          "type": "llm_judge",
          "weight": 0.183333
        },
        {
          "criterion": "Agent writes the Jamaica trip flight slide showing the flight number, departure time, confirmation code, and fare, all pulled from Michael's live Dinoco and/or HooliMail Jamaica trip sources (not invented)",
          "type": "llm_judge",
          "weight": 0.266667
        },
        {
          "criterion": "Agent writes the Sandals hotel slide showing Sandals Montego Bay, the live Cheskepdia confirmation number, the live check-in and check-out dates, and the live total price \u2014 all read from Michael's Cheskepdia booking",
          "type": "llm_judge",
          "weight": 0.266667
        },
        {
          "criterion": "Agent writes the third Jamaica trip slide as a rough day-by-day schedule spanning the live Sandals Montego Bay check-in/check-out window",
          "type": "llm_judge",
          "weight": 0.141667
        },
        {
          "criterion": "Agent saves the three-slide Jamaica trip Impress deck to ~/Documents/Trips/jamaica_itinerary.odp inside Michael's Trips documents folder",
          "type": "llm_judge",
          "weight": 0.141667
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "aggregation-f002",
    "app": "multi_app",
    "instruction": "It's early morning today. Give me a morning brief in LibreOffice Writer covering any priority email from this week, what's on my calendar today, anything that came in overnight on HooliChat or HooliWork, and any charges that hit my card since midnight.",
    "apps_involved": [
      "HooliMail",
      "HooliCalendar",
      "HooliChat",
      "HooliWork",
      "Gringotts"
    ],
    "category": "aggregation",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent surfaces important or unread HooliMail messages from the current week using 'from sender' or 'this week' framing (heuristic priority flagging by sender/subject is acceptable when no explicit priority/star flag is set)",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent enumerates today's HooliCalendar events by reading the UI",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent checks HooliChat and HooliWork for overnight messages and summarizes them",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent reviews Gringotts for transactions posted since today's midnight",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent delivers a consolidated morning brief that covers HooliMail priority email, HooliCalendar events, HooliChat/HooliWork overnight messages, and Gringotts charges",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent organizes the morning brief so the four sources (email, calendar, overnight HooliChat/HooliWork, Gringotts card charges) are each clearly distinguishable and scannable",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent includes all four morning-brief sources (HooliMail, HooliCalendar, HooliChat/HooliWork, Gringotts charges) before ending the task",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f004",
    "app": "multi_app",
    "instruction": "What did that recent Philadelphia overnight trip cost me all-in? It was a quick one-night hotel stay downtown. Give me the damage from the hotel and any related card charges.",
    "apps_involved": [
      "Cheskepdia",
      "Gringotts"
    ],
    "category": "aggregation",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens Cheskepdia and reads the live Philadelphia overnight hotel booking (property name as shown on the Cheskepdia booking record), reporting its total price and dates as shown live",
          "type": "llm_judge",
          "weight": 0.35
        },
        {
          "criterion": "Agent cross-references Gringotts around the Philadelphia overnight trip dates (as read from the Cheskepdia booking) for related card charges such as the hotel (the hotel charge on the card may appear under a different merchant name \u2014 e.g. Hilton Garden Inn \u2014 than the Cheskepdia booking property, and may post on or shortly after the checkout date), gas, or food",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent reports the combined Philadelphia trip total (hotel + any on-trip card charges) as the all-in damage Michael asked for",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent states the Philadelphia trip all-in damage as a single final dollar figure combining the hotel and incidentals (not assuming a flight)",
          "type": "llm_judge",
          "weight": 0.15
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f005",
    "app": "multi_app",
    "instruction": "Add up what I've already committed to the Barbados trip in a LibreOffice Calc sheet and flag any gap. I know the hotel is booked but I'm not sure if the flight charge actually hit yet.",
    "apps_involved": [
      "Cheskepdia",
      "Dinoco Airlines",
      "Gringotts"
    ],
    "category": "aggregation",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads the Sandals Royal Barbados hotel booking total from Cheskepdia to add up Michael's committed Barbados trip spend",
          "type": "llm_judge",
          "weight": 0.175
        },
        {
          "criterion": "Agent locates Michael's Barbados flight booking on Dinoco Airlines and notes its confirmation code",
          "type": "llm_judge",
          "weight": 0.175
        },
        {
          "criterion": "Agent identifies the Dinoco Airlines Barbados flight charge in Gringotts (matching by confirmation code DN-78558 or the AVP->BGI route) as a charge that has posted to the account",
          "type": "llm_judge",
          "weight": 0.175
        },
        {
          "criterion": "Agent identifies the Cheskepdia Sandals Royal Barbados hotel charge in Gringotts as a charge that has posted to the account",
          "type": "llm_judge",
          "weight": 0.175
        },
        {
          "criterion": "Agent computes the total committed Barbados trip spend by summing the posted Gringotts charges (flight and hotel) and correctly represents what has actually been charged to the card",
          "type": "llm_judge",
          "weight": 0.175
        },
        {
          "criterion": "Agent enters the committed trip totals into a LibreOffice Calc sheet and flags any remaining gap (such as incidentals or future charges not yet posted)",
          "type": "llm_judge",
          "weight": 0.125
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f009",
    "app": "multi_app",
    "instruction": "Build me a standup summary for today covering all the open issues on SprintBoard, what I posted on HooliWork, and what meetings I have including the takeaways from the recent all-hands (notes are saved on my Desktop as meeting_notes_allhands).",
    "apps_involved": [
      "SprintBoard",
      "HooliWork",
      "HooliCalendar",
      "Files"
    ],
    "category": "aggregation",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent enumerates all open SprintBoard issues across Michael's active projects (if none, reports 'no open issues' rather than fabricating)",
          "type": "llm_judge",
          "weight": 0.152174
        },
        {
          "criterion": "Agent reports Michael's HooliWork posts (if none, reports 'no posts' rather than fabricating)",
          "type": "llm_judge",
          "weight": 0.152174
        },
        {
          "criterion": "Agent reads Michael's HooliCalendar meetings visually from the UI and includes them in the standup summary",
          "type": "llm_judge",
          "weight": 0.152174
        },
        {
          "criterion": "Agent assembles Michael's standup summary so the three topics (SprintBoard open issues, HooliWork posts, HooliCalendar meetings) are each clearly distinguishable (using 'none' where applicable)",
          "type": "llm_judge",
          "weight": 0.152174
        },
        {
          "criterion": "Agent writes the standup summary as plain-text bullets that Michael can paste directly into a HooliWork channel to sound prepared",
          "type": "llm_judge",
          "weight": 0.173913
        },
        {
          "criterion": "Agent actually checks all three standup sources (SprintBoard open issues, HooliWork posts, HooliCalendar meetings) before assembling the summary, even if some are empty",
          "type": "llm_judge",
          "weight": 0.086957
        },
        {
          "criterion": "Agent opens ~/Desktop/meeting_notes_allhands.txt and includes the takeaways in the standup summary (not just the meeting title)",
          "type": "llm_judge",
          "weight": 0.130435
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f010",
    "app": "multi_app",
    "instruction": "Pull together everything I've spent or committed to the Dundies this year so far into a LibreOffice Calc Spreadsheet. Check my trophy orders, any mentions in the planning group chat, and the categories doc. Write up a report in LibreOffice Writer when you're done.",
    "apps_involved": [
      "HooliShop",
      "HooliChat",
      "Files",
      "LibreOffice Calc",
      "LibreOffice Writer"
    ],
    "category": "aggregation",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent searches HooliShop for any current-year trophy/award orders tied to the Dundies, reading the grand total if found and noting absence if none exists",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent scans HooliChat 'Dundie Awards Planning' group for any referenced costs",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent reads the Dundies 2026 categories doc (Documents/Dundies_2026_Categories.txt) in Files for any cost references",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent produces a clear Dundies budget rollup entered into a LibreOffice Calc spreadsheet",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent labels each Dundies cost line as known vs. unknown in the rollup and marks anything not yet committed (including the trophy line if no order exists)",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent checks all three Dundies sources (HooliShop for trophy/award orders, HooliChat 'Dundie Awards Planning' group, Dundies categories doc) before producing the rollup",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent writes a summary report in LibreOffice Writer (separate from the Calc spreadsheet) covering the Dundies budget findings",
          "type": "llm_judge",
          "weight": 0.12
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f011",
    "app": "multi_app",
    "instruction": "Put together a last-quarter performance review packet I can send to David Wallace. Pull my project progress (closed and open), my chat participation, any LockedIn posts, and relevant sent email threads. Pair it with a quick forward-look using the quarter sales forecast doc in ~/Downloads/ so David sees I'm thinking next quarter too. Make me look good. Build the actual perf-review packet as a LibreOffice Impress 4-slide deck (project progress / chat + LockedIn presence / forward-look / ask) \u2014 David likes visuals.",
    "apps_involved": [
      "SprintBoard",
      "HooliWork",
      "LockedIn",
      "HooliMail",
      "Files",
      "LibreOffice Impress"
    ],
    "category": "aggregation",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent summarizes last-quarter SprintBoard project progress across Michael's projects, covering both closed (completed) and open (in-progress) tasks for each project",
          "type": "llm_judge",
          "weight": 0.113636
        },
        {
          "criterion": "Agent reports HooliWork participation by top channels for the last completed quarter",
          "type": "llm_judge",
          "weight": 0.113636
        },
        {
          "criterion": "Agent reads LockedIn for last-quarter post count and engagement (even if only 1 post or zero, agent reports the live count and any engagement metrics for that post)",
          "type": "llm_judge",
          "weight": 0.113636
        },
        {
          "criterion": "Agent references Michael's sent mail threads from the live HooliMail Sent folder for last-quarter topics (reading whatever From: address the seeded sent messages actually use)",
          "type": "llm_judge",
          "weight": 0.113636
        },
        {
          "criterion": "Agent delivers a coherent last-completed-quarter performance review packet addressed to David Wallace pulling project progress, chat participation, LockedIn posts, and sent email threads",
          "type": "llm_judge",
          "weight": 0.113636
        },
        {
          "criterion": "Agent organizes the performance review packet so each of the four sources (SprintBoard projects, HooliWork chat, LockedIn posts, HooliMail threads) is clearly represented and cites specific numbers",
          "type": "llm_judge",
          "weight": 0.162337
        },
        {
          "criterion": "Agent checks all four last-quarter performance review sources (SprintBoard, HooliWork, LockedIn, HooliMail) before assembling the packet for David Wallace",
          "type": "llm_judge",
          "weight": 0.081169
        },
        {
          "criterion": "Agent reads ~/Downloads/DunderMifflin_Q2_Sales_Forecast.txt and folds a forward-look paragraph from it into the last-quarter perf-review packet",
          "type": "llm_judge",
          "weight": 0.097403
        },
        {
          "criterion": "Agent builds the last-quarter perf-review packet as a LibreOffice Impress 4-slide deck (one slide each: project progress / chat + LockedIn presence / forward-look from the sales forecast doc / explicit ask)",
          "type": "llm_judge",
          "weight": 0.090909
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f018",
    "app": "multi_app",
    "instruction": "Roll up my tax deductions for my accountant. Pull the line items (charitable, home-office days, 1099 income, W-2 gross/withholdings) from my in-progress Tax Year 2025 return in SpeedTax, and cross-check each line against my matching Tax_2025 source docs in ~/Documents (w2_summary.txt, 1099s.txt) to see whether the numbers in the in-progress return reconcile with the source docs. Don't guess the final refund on the in-progress return \u2014 that one isn't done yet.",
    "apps_involved": [
      "SpeedTax",
      "Files",
      "Gringotts"
    ],
    "category": "aggregation",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reports the charitable deductions total shown in SpeedTax for the in-progress Tax Year 2025 return in the tax deduction rollup",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent reports the home-office days count from the in-progress Tax Year 2025 return shown in SpeedTax in the tax deduction rollup",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent reports the 1099 income amount shown in SpeedTax for the in-progress Tax Year 2025 return and the payer name in the tax deduction rollup",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent reports W-2 gross income, federal withholding, and state withholding totals from the in-progress Tax Year 2025 return in the tax deduction rollup",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent reads the Tax_2025/w2_summary.txt and Tax_2025/1099s.txt docs in Files and compares each line item against the corresponding values in the in-progress 2025 return",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent declines to state a final refund or owed amount for the in-progress 2025 return and notes it is not done yet in the rollup",
          "type": "llm_judge",
          "weight": 0.125
        },
        {
          "criterion": "Agent completes the end-to-end tax deduction rollup covering charitable, home-office days, 1099, and W-2 totals (all anchored on Tax Year 2025) before ending the task",
          "type": "llm_judge",
          "weight": 0.125
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f020",
    "app": "multi_app",
    "instruction": "My credit card is deep in the hole. Tell me how fast I've been burning through it by giving me a monthly burn rate, then check if my BatBucks cash could cover a paydown.",
    "apps_involved": [
      "Gringotts",
      "BatBucks"
    ],
    "category": "aggregation",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads the current Gringotts credit card balance and credit limit from the credit account view",
          "type": "llm_judge",
          "weight": 0.186666
        },
        {
          "criterion": "Agent pulls the available Gringotts credit card charges and payments history (about 2-3 months of activity) to feed the monthly burn rate computation",
          "type": "llm_judge",
          "weight": 0.173334
        },
        {
          "criterion": "Agent computes a monthly net burn rate on the credit card from the available charges minus payments",
          "type": "llm_judge",
          "weight": 0.173334
        },
        {
          "criterion": "Agent calculates the runway in months from the current credit card balance to the credit limit at the computed monthly burn rate",
          "type": "llm_judge",
          "weight": 0.173334
        },
        {
          "criterion": "Agent reports the credit card burn rate and months-to-limit as specific numbers and states whether BatBucks cash can cover the paydown",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent completes all credit card burn-rate subtasks and delivers the BatBucks paydown check against the card balance before ending the task",
          "type": "llm_judge",
          "weight": 0.133332
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f029",
    "app": "multi_app",
    "instruction": "Who am I sending mail to the most? Give me my top 5 recipients and my top 5 recurring thread subjects over the past year in a LibreOffice Writer. Heads up: I think my sent mail is filed in the new folder, not Sent.",
    "apps_involved": [
      "HooliMail",
      "Files"
    ],
    "category": "aggregation",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent locates Michael's actual sent mail \u2014 either by recognizing the user's Maildir/new/ hint is wrong and walking ~/Maildir/.Sent/ on the filesystem, OR by listing the Sent folder via the HooliMail app/API \u2014 and notes that Maildir/new/ does not in fact contain sent mail",
          "type": "llm_judge",
          "weight": 0.1875
        },
        {
          "criterion": "Agent reports top 5 recipients of Michael's sent mail, ranked by frequency, with correct recipient names drawn from the sent-mail data (whether from Maildir or HooliMail API)",
          "type": "llm_judge",
          "weight": 0.1875
        },
        {
          "criterion": "Agent reports top 5 recurring sent thread subjects",
          "type": "llm_judge",
          "weight": 0.1875
        },
        {
          "criterion": "Agent reports the total sent mail count from michael.scott (citing the actual number found, via Maildir/.Sent/ or via HooliMail API Sent listing) over the past year",
          "type": "llm_judge",
          "weight": 0.1875
        },
        {
          "criterion": "Agent delivers two ranked HooliMail sent lists: top 5 recipients and top 5 recurring thread subjects over the past year",
          "type": "llm_judge",
          "weight": 0.125
        },
        {
          "criterion": "Agent completes the top 5 recipients ranking and top 5 recurring sent thread subjects ranking from HooliMail end-to-end",
          "type": "llm_judge",
          "weight": 0.125
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f031",
    "app": "multi_app",
    "instruction": "Compare how I was ordering HangryDash a few months ago to how I'm ordering it now. Monthly spend, top restaurants, and any trend I should know about. Report the numbers to me in a LibreOffice Calc SpreadSheet.",
    "apps_involved": [
      "HangryDash"
    ],
    "category": "aggregation",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent computes HangryDash monthly average spend from an earlier window of the order history (e.g., the first ~3 months of available data)",
          "type": "llm_judge",
          "weight": 0.17
        },
        {
          "criterion": "Agent computes recent-window HangryDash monthly average spend and total order count",
          "type": "llm_judge",
          "weight": 0.17
        },
        {
          "criterion": "Agent identifies the top HangryDash restaurant for each window based on the live order counts",
          "type": "llm_judge",
          "weight": 0.17
        },
        {
          "criterion": "Agent writes a HangryDash trend narrative covering the earlier-to-recent shift in monthly spend and top restaurants",
          "type": "llm_judge",
          "weight": 0.17
        },
        {
          "criterion": "Agent reports HangryDash monthly spend and top restaurants with concrete numbers as computed from the live order history for both windows",
          "type": "llm_judge",
          "weight": 0.17
        },
        {
          "criterion": "Agent completes the full HangryDash compare across the earlier and recent windows, including any trend the user should know about",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent records the comparison results (monthly spend per window, top restaurants, trend) in a LibreOffice Calc spreadsheet",
          "type": "llm_judge",
          "weight": 0.07
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f033",
    "app": "multi_app",
    "instruction": "Tell me every recurring or frequent Thursday obligation I have this quarter, including any improv-related events (class or workshop) that fall on Thursdays. Cross-reference HooliMail for any Thursday event confirmations too. I need to know what I'm locked into. Type up a summary of my Thursdays on a LibreOffice Writer Document.",
    "apps_involved": [
      "HooliCalendar",
      "HooliMail"
    ],
    "category": "aggregation",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads Thursday obligations this quarter from the HooliCalendar UI (not from an empty per-vm sqlite)",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent lists the recurring Thursday events as they appear live in HooliCalendar and calls out any improv-related events (class or workshop) that fall on Thursdays",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent cross-references HooliMail for any Thursday-event confirmations; if none exist, agent states so rather than fabricating",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent produces a Thursday obligations digest",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent delivers a Thursday obligations digest that lists every current-quarter Thursday event (recurring or one-off with improv) with date and time",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent covers Thursdays in the current quarter and lists each recurring or notable obligation (including any Thursday improv sessions)",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f040",
    "app": "multi_app",
    "instruction": "Give me a complete rollup of everything I pay regularly: subscriptions, improv class, anything recurring. If you happen to find any renewal notices in my HooliMail, factor those in \u2014 but Gringotts is the primary source. I want both monthly and annual totals.",
    "apps_involved": [
      "Gringotts",
      "HooliMail"
    ],
    "category": "aggregation",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent enumerates the recurring Gringotts bill_pay entries as they appear live (each payee with its amount and frequency) and reports a correct monthly subtotal from that live list",
          "type": "llm_judge",
          "weight": 0.162462
        },
        {
          "criterion": "Agent adds the improv class charge at the per-session amount and cadence as it appears in the Gringotts transactions ledger",
          "type": "llm_judge",
          "weight": 0.162462
        },
        {
          "criterion": "Agent scans for other recurring charges and reports findings",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent optionally checks HooliMail for renewal notices covering the recurring subscriptions and any other regular payments; absence of such emails is acceptable since the primary source is Gringotts",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent reports both monthly and annual combined totals",
          "type": "llm_judge",
          "weight": 0.124924
        },
        {
          "criterion": "Quality: Complete rollup is clear and totaled",
          "type": "llm_judge",
          "weight": 0.124924
        },
        {
          "criterion": "Agent completes a rollup of everything the user pays regularly including subscriptions, improv class, and any other recurring charges with monthly and annual totals",
          "type": "llm_judge",
          "weight": 0.125228
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "contradiction-f003",
    "app": "multi_app",
    "instruction": "My Dinoco profile says I have Gold Voyager and a decent chunk of miles, but honestly I haven't flown THAT much. Can you walk through the flights I've actually booked and tell me whether the mileage balance is realistic or inflated? Cross-check Gringotts Dinoco charges and any HooliMail boarding passes to be sure about what I've actually flown.",
    "apps_involved": [
      "Dinoco Airlines",
      "Gringotts",
      "HooliMail"
    ],
    "category": "contradiction",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads the Dinoco profile's actual loyalty tier and mileage balance directly from Dinoco Airlines (and notes if it differs from Michael's Gold Voyager claim)",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent enumerates every Dinoco flight in the history with route details",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent cross-references Gringotts for Dinoco charges to corroborate flight history",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent searches HooliMail and ~/Downloads for Dinoco boarding-pass and itinerary artifacts to corroborate the flights Michael has actually booked; if HooliMail has no matching entries the agent surfaces whatever local boarding-pass files exist",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent computes miles-earned-from-actual-flights and compares to the profile balance",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent flags the Dinoco mileage discrepancy between profile balance and booked flights and explains whether the gap is realistic or inflated",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent cites Dinoco Airlines flight history, Gringotts charges, and HooliMail itineraries as the sources reconciling Michael's booked flights with his profile balance",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent completes the walk-through of flights Michael has actually booked and reports whether the Dinoco profile's live mileage balance is realistic or inflated relative to the flights Michael has taken",
          "type": "llm_judge",
          "weight": 0.2
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f004",
    "app": "multi_app",
    "instruction": "I think I might be doubled up on GameStop. I've got it in BatBucks as an actual stock, and I also have a bet on OddsMarket for GameStop going up. Quantify my total GameStop exposure across both accounts. While you're at it, scan HooliChat and HooliWork for any GME mentions so I know who else I've been talking GameStop with.",
    "apps_involved": [
      "BatBucks",
      "OddsMarket",
      "HooliChat",
      "HooliWork"
    ],
    "category": "contradiction",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads the BatBucks GME equity position, listing shares count and average cost as the actual stock holding",
          "type": "llm_judge",
          "weight": 0.133336
        },
        {
          "criterion": "Agent locates the OddsMarket 'GameStop above $100' (will-gme-100-yearend) YES event contract and reports its live share count and cost basis",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent searches HooliChat for GME/GameStop mentions",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent searches HooliWork for GME/GameStop mentions",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent explicitly identifies the doubled-up directional thesis across two products",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent computes total GameStop exposure across both BatBucks shares and the OddsMarket GME YES bet in dollar terms",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent cites BatBucks GME holding data and OddsMarket GameStop event contract data as the sources quantifying exposure across both accounts",
          "type": "llm_judge",
          "weight": 0.11111
        },
        {
          "criterion": "Agent completes the GameStop exposure reconciliation end-to-end and reports whether Michael is doubled up on GME across BatBucks and OddsMarket",
          "type": "llm_judge",
          "weight": 0.111114
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f005",
    "app": "multi_app",
    "instruction": "I've been telling Jim that Oscar said VTI is the only smart buy, but I'm pretty sure my actual portfolio is mostly GameStop and Rivian. How concentrated am I in the meme stocks versus VTI? Return the percentages in a LibreOffice Calc Spreadsheet. Also check HooliChat or HooliWork for the Oscar/VTI conversation backing up my story. I want to know how big the lie is. If the lie is big enough, convert all of my GME and Rivian to VTI.",
    "apps_involved": [
      "BatBucks",
      "HooliChat",
      "HooliWork"
    ],
    "category": "contradiction",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent lists every BatBucks holding with shares and avg cost",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent computes cost-basis dollar values per position and total portfolio",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent computes the GME + RIVN concentration as a percentage of the portfolio",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent computes the VTI percentage and observes how small it is",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent searches HooliChat or HooliWork for Oscar's investment advice references",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent presents the contradiction between the 'Oscar said VTI is the only smart buy' story Michael told Jim and the actual meme-heavy GameStop plus Rivian BatBucks allocation",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent cites BatBucks holding data and any Oscar or VTI references from HooliChat as the sources sizing how concentrated Michael is in meme stocks versus VTI",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent completes the GameStop and Rivian concentration comparison versus VTI end-to-end and states how big the stated-vs-actual gap is",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent makes a defensible judgment about whether the meme-stock concentration qualifies as 'big enough' to warrant conversion (e.g., GME+RIVN dominating the portfolio vs. VTI being a small fraction), and explicitly states that judgment",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "If agent determines the lie is big enough, agent executes (or simulates) selling all GME and RIVN and buying VTI with the proceeds; if agent determines the lie is not big enough, agent explicitly declines the conversion with a stated reason",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f006",
    "app": "multi_app",
    "instruction": "I've got the Jamaica trip AND the Barbados trip booked about four weeks apart. Given where my credit card balance is right now, can I actually afford both of these or am I about to max out? Be honest with me.",
    "apps_involved": [
      "Cheskepdia",
      "Dinoco Airlines",
      "Gringotts"
    ],
    "category": "contradiction",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent retrieves the Jamaica and Barbados hotel totals and dates from Cheskepdia",
          "type": "llm_judge",
          "weight": 0.133336
        },
        {
          "criterion": "Agent reads the Jamaica and Barbados flight booking costs from Dinoco Airlines for both trips booked about four weeks apart",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent reads the Gringotts credit card balance, credit limit, utilization, and checking account balance fields",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent computes the remaining Jamaica and Barbados trip cost still owed after hotel and flight charges already on the credit card",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent compares remaining Jamaica and Barbados trip cost to available Gringotts credit headroom to decide whether Michael can actually afford both or will max out",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent states the contradiction between booking the Jamaica and Barbados trips about four weeks apart and Michael's actual credit card balance position",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent cites Cheskepdia hotel totals, Dinoco Airlines flight costs, and Gringotts credit card balance as the sources answering whether Michael can actually afford both trips",
          "type": "llm_judge",
          "weight": 0.11111
        },
        {
          "criterion": "Agent completes the Jamaica and Barbados afford-both analysis end-to-end and states honestly whether Michael is about to max out his credit card",
          "type": "llm_judge",
          "weight": 0.111114
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f008",
    "app": "multi_app",
    "instruction": "The Team Morale Initiative Q2 project has a ton of open tasks, but I'm pretty sure I've been spending more time buying karaoke machines and Dundie trophies than actually closing tasks. Am I doing the morale work or just buying stuff?",
    "apps_involved": [
      "SprintBoard",
      "HooliShop",
      "HooliWork"
    ],
    "category": "contradiction",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent lists the open tasks on the Team Morale Initiative Q2 SprintBoard project, focusing on morale-specific items (Dundies planning, karaoke, catering, trophies) within whatever broader list the project contains",
          "type": "llm_judge",
          "weight": 0.133336
        },
        {
          "criterion": "Agent identifies Michael's assigned tasks and their age",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent lists morale-adjacent HooliShop orders (mugs, karaoke, trophies)",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent searches HooliWork for morale-project status updates from Michael",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent compares the SprintBoard morale-specific open tasks on Team Morale Initiative Q2 (Dundies planning, karaoke, catering, trophies) against the count of morale-adjacent HooliShop purchases including karaoke machines and Dundie trophies",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent presents the contradiction between Michael actually closing Team Morale Initiative Q2 tasks versus buying karaoke machines and Dundie trophies on HooliShop",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent cites SprintBoard Team Morale Initiative Q2 task state and HooliShop karaoke-trophy orders as the sources answering whether Michael is actually doing morale work",
          "type": "llm_judge",
          "weight": 0.11111
        },
        {
          "criterion": "Agent completes the morale work analysis end-to-end and answers whether Michael is actually closing tasks or just buying Dundie trophies and karaoke machines",
          "type": "llm_judge",
          "weight": 0.111114
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f011",
    "app": "multi_app",
    "instruction": "My tax return claims a charitable deduction for last year. Can you cross-check that against actual donations that hit my card over the past year? If you happen to spot any charity donation receipts in my HooliMail while you're at it, factor those in too. I want to know if the number holds up or if my accountant is exaggerating.",
    "apps_involved": [
      "SpeedTax",
      "Gringotts",
      "HooliMail"
    ],
    "category": "contradiction",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads the SpeedTax charitable deduction total claimed on the most-recent-filed tax return",
          "type": "llm_judge",
          "weight": 0.155552
        },
        {
          "criterion": "Agent searches Gringotts for donation transactions over the past year",
          "type": "llm_judge",
          "weight": 0.155556
        },
        {
          "criterion": "Agent sums actual donation transactions found",
          "type": "llm_judge",
          "weight": 0.155556
        },
        {
          "criterion": "Agent optionally checks HooliMail for charitable donation receipts; absence of such emails is acceptable since the primary cross-check uses Gringotts transactions",
          "type": "llm_judge",
          "weight": 0.155556
        },
        {
          "criterion": "Agent explicitly flags whether the deduction matches the evidence",
          "type": "llm_judge",
          "weight": 0.155556
        },
        {
          "criterion": "Agent cites the SpeedTax charitable deduction total and Gringotts card donation transactions as data sources backing the cross-check against the accountant's claim",
          "type": "llm_judge",
          "weight": 0.11111
        },
        {
          "criterion": "Agent delivers a final verdict on whether the charitable deduction claim holds up against Gringotts card donations without leaving the cross-check unresolved",
          "type": "llm_judge",
          "weight": 0.111114
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f012",
    "app": "multi_app",
    "instruction": "The current year's Dundies planning email thread is talking about ordering trophies. I have a vague memory of buying some a while back, but I'm not sure if it actually happened. Am I about to pay for trophies twice, or do I need to order them fresh? Check HooliShop for any prior trophy/award-style orders, then check SprintBoard for any Dundies trophy-ordering tasks too in case it's on there.",
    "apps_involved": [
      "HooliShop",
      "HooliMail",
      "SprintBoard"
    ],
    "category": "contradiction",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent searches HooliShop order history for any prior trophy or award-style purchase, reporting either a hit or a clean-absence finding",
          "type": "llm_judge",
          "weight": 0.133336
        },
        {
          "criterion": "Agent checks HooliShop for any current-year Dundie trophy order and reports whether one already exists",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent opens and reads the current-year Dundies email thread (seeded as 'THE DUNDIES ARE BACK BABY')",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent extracts from the thread the question of whether existing trophies can be reused",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent checks SprintBoard for Dundie trophy ordering or planning tasks tied to the current-year Dundies email thread",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent presents a clear conclusion on whether trophies need to be ordered fresh or whether prior orders exist to reuse",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent cites HooliShop order records (or their absence) and the current-year Dundies planning email thread as the data sources backing the verdict",
          "type": "llm_judge",
          "weight": 0.11111
        },
        {
          "criterion": "Agent delivers a final answer on whether Michael is about to pay for Dundie trophies twice without leaving subtasks unresolved",
          "type": "llm_judge",
          "weight": 0.111114
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f013",
    "app": "multi_app",
    "instruction": "I bought multiple World's Best Boss mugs in a single HooliShop order. Why so many at once? Is it for gifts, replacements for breakage, or backups for myself? Figure it out. My LockedIn headline and any coworker chatter on HooliChat or HooliWork might hint at the why.",
    "apps_involved": [
      "HooliShop",
      "LockedIn",
      "HooliChat",
      "HooliWork"
    ],
    "category": "contradiction",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent pulls up Michael's World's Best Boss mug order in HooliShop with its date, quantity, and shipping address",
          "type": "llm_judge",
          "weight": 0.133336
        },
        {
          "criterion": "Agent notes the shipping address on the order and observes that all units shipped to one place rather than being routed to different recipients",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent recognizes the mugs were purchased together in a single order (i.e. bulk-buy, not a spaced-out replacement pattern)",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent reads the LockedIn headline as the identity signal for why Michael bought multiple World's Best Boss mugs",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent searches HooliChat/HooliWork for mug-related conversation",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent presents the most likely explanation for the bulk mug buy (gifts to coworkers, backup stock, replacements for breakage) with cited evidence",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent cites the HooliShop mug order record, shipping address, and LockedIn headline as the data sources backing the explanation",
          "type": "llm_judge",
          "weight": 0.11111
        },
        {
          "criterion": "Agent delivers a final explanation for Michael's bulk World's Best Boss mug purchase without leaving the gifts/backup/breakage question unresolved",
          "type": "llm_judge",
          "weight": 0.111114
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f014",
    "app": "multi_app",
    "instruction": "I've been telling people in DMs that I make 160K base, but my W-2 says something different. Can you check my W-2, the paycheck deposits on my card, and compare them to what I've been claiming? Note my LockedIn title/company as the identity I'm claiming against. I need to know how embarrassing this is.",
    "apps_involved": [
      "LockedIn",
      "SpeedTax",
      "HooliChat",
      "Gringotts"
    ],
    "category": "contradiction",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads the LockedIn title and company that Michael is claiming against the 160K base salary DM brag",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent reads the W-2 employer and gross wages from SpeedTax to compare against the 160K base salary claim",
          "type": "llm_judge",
          "weight": 0.133336
        },
        {
          "criterion": "Agent searches HooliChat DMs for Michael's 160K base salary brag message to compare against the W-2",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent sums past-year paycheck deposits from Gringotts",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent compares the W-2 gross wages, the paycheck card deposits, and the 160K base salary DM claim as three numeric data points",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent explicitly flags the discrepancy between Michael's 160K base salary DM claim and the gross wages reported on his W-2",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent cites the SpeedTax W-2 gross wages, Gringotts paycheck deposits, and HooliChat DM salary brag as the data sources backing the check of Michael's 160K claim",
          "type": "llm_judge",
          "weight": 0.11111
        },
        {
          "criterion": "Agent delivers a final verdict comparing the 160K DM salary claim, W-2 wages, and paycheck card deposits without leaving the check unresolved",
          "type": "llm_judge",
          "weight": 0.111114
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f015",
    "app": "multi_app",
    "instruction": "I've got a TableFind reservation coming up \u2014 check my calendar for that exact date and time. Is there something else already scheduled that's going to conflict? Also peek at HooliChat in case we already rescheduled or confirmed this dinner.",
    "apps_involved": [
      "TableFind",
      "HooliCalendar",
      "HooliChat"
    ],
    "category": "contradiction",
    "difficulty": "easy",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads the next (earliest upcoming) TableFind reservation date and time (whichever venue Michael's next upcoming reservation actually points at in the seed) so the calendar conflict check can target that exact slot",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent navigates HooliCalendar to the correct date and time",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent checks every HooliCalendar event on the TableFind reservation date and reports any that overlap the reservation window (or explicitly states none do)",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent searches HooliChat for messages already confirming or rescheduling the upcoming TableFind reservation on the reservation date",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent explicitly states whether the upcoming TableFind reservation conflicts with any existing calendar event and, if a conflict exists, recommends a resolution (or confirms no conflict if calendar is clear at that slot)",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent cites the TableFind reservation record, HooliCalendar events, and HooliChat messages as the data sources backing the conflict check",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent delivers a final answer on whether the upcoming TableFind reservation conflicts with something already scheduled on Michael's calendar without leaving subtasks unresolved",
          "type": "llm_judge",
          "weight": 0.2
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f016",
    "app": "multi_app",
    "instruction": "My upcoming Jamaica flight leaves early morning that day. Check my calendar for anything scheduled that morning (meetings, improv, whatever), plus any SprintBoard tasks due that day and any HooliMail invites landing on that date, and flag the worst conflict so I can deal with it.",
    "apps_involved": [
      "Dinoco Airlines",
      "HooliCalendar",
      "SprintBoard",
      "HooliMail"
    ],
    "category": "contradiction",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads the Jamaica flight departure date and early morning time from Dinoco Airlines to anchor the calendar conflict check",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent lists every HooliCalendar event on the Jamaica flight departure date",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent identifies any calendar event (meetings, improv, anything) scheduled during the airport-arrival window before the Jamaica flight",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent checks SprintBoard for any tasks due on the Jamaica flight date that conflict with the early morning departure",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent searches HooliMail for meeting invites scheduled on the Jamaica flight date to surface any conflict with the early morning departure",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent flags the most serious conflict and proposes an action",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent cites Dinoco Airlines Jamaica flight, HooliCalendar events, SprintBoard tasks, and HooliMail invites as the data sources backing the morning conflict check",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent delivers a final answer flagging the worst calendar conflict with the early morning Jamaica flight without leaving subtasks unresolved",
          "type": "llm_judge",
          "weight": 0.2
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f017",
    "app": "multi_app",
    "instruction": "I've been telling Pam over DMs that I'm being financially disciplined this year. Pull my biggest single expense so far this year (check both Gringotts and my biggest discretionary BatBucks buy) and put it next to that claim. How bad does it look?",
    "apps_involved": [
      "Gringotts",
      "BatBucks",
      "HooliChat"
    ],
    "category": "contradiction",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent identifies the largest year-to-date Gringotts expense with date and amount",
          "type": "llm_judge",
          "weight": 0.155552
        },
        {
          "criterion": "Agent surfaces the BatBucks GME buy order from the live orders ledger as a separate biggest discretionary meme-stock spend alongside the largest Gringotts expense, naming the share count and dollar size it reads",
          "type": "llm_judge",
          "weight": 0.155556
        },
        {
          "criterion": "Agent searches HooliChat for Pam DMs about financial discipline",
          "type": "llm_judge",
          "weight": 0.155556
        },
        {
          "criterion": "Agent cites Michael's 'financially disciplined' DM claim to Pam next to the biggest year-to-date expense transaction",
          "type": "llm_judge",
          "weight": 0.155556
        },
        {
          "criterion": "Agent explicitly flags the stated-vs-actual contradiction between the 'financially disciplined' DM claim and Michael's biggest single expense year-to-date",
          "type": "llm_judge",
          "weight": 0.155556
        },
        {
          "criterion": "Agent cites Gringotts year-to-date expenses, BatBucks GME trade, and HooliChat Pam DMs as the data sources backing the discipline-vs-spend contradiction",
          "type": "llm_judge",
          "weight": 0.11111
        },
        {
          "criterion": "Agent delivers a final verdict on how bad the biggest year-to-date expense looks against Michael's 'financially disciplined' DM claim without leaving subtasks unresolved",
          "type": "llm_judge",
          "weight": 0.111114
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f021",
    "app": "multi_app",
    "instruction": "My LockedIn says I'm the Author of 'Somehow I Manage' but I'm pretty sure the file on my computer only has Chapter 1. Did I actually finish the book? Reconcile the claim with what's actually saved locally, and while you're at it, scan my Firefox history for any self-publishing/manuscript uploads and HooliMail for any publisher or editor replies.",
    "apps_involved": [
      "LockedIn",
      "Files",
      "Firefox",
      "HooliMail"
    ],
    "category": "contradiction",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads the LockedIn 'Author of Somehow I Manage' headline that Michael is claiming against the unfinished book",
          "type": "llm_judge",
          "weight": 0.155552
        },
        {
          "criterion": "Agent opens the Chapter 1 file and reports its scope",
          "type": "llm_judge",
          "weight": 0.155556
        },
        {
          "criterion": "Agent searches Firefox history for 'Somehow I Manage' book publishing or manuscript uploads that would back the LockedIn author claim",
          "type": "llm_judge",
          "weight": 0.155556
        },
        {
          "criterion": "Agent searches HooliMail for publisher or editor correspondence about 'Somehow I Manage' to verify whether Michael actually finished the book",
          "type": "llm_judge",
          "weight": 0.155556
        },
        {
          "criterion": "Agent states whether the author claim is supported or aspirational",
          "type": "llm_judge",
          "weight": 0.155556
        },
        {
          "criterion": "Agent cites the LockedIn 'Author of Somehow I Manage' headline and the local Chapter 1 book file as the data sources backing the author-claim reconciliation",
          "type": "llm_judge",
          "weight": 0.11111
        },
        {
          "criterion": "Agent delivers a final verdict on whether Michael actually finished 'Somehow I Manage' or only has Chapter 1 saved locally without leaving subtasks unresolved",
          "type": "llm_judge",
          "weight": 0.111114
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f022",
    "app": "multi_app",
    "instruction": "I've been sending a bunch of money via Zelle each month. Can you figure out who's getting it (use HooliChat DMs and HooliMail Zelle confirmations to put names to recipients), and then cross-check against my SpeedTax return for charitable donations or dependents claimed there? If SpeedTax only shows aggregate totals (not per-recipient names), call that out as inconclusive instead of guessing. I don't want to double-dip by accident.",
    "apps_involved": [
      "Gringotts",
      "HooliChat",
      "HooliMail",
      "SpeedTax"
    ],
    "category": "contradiction",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent lists all year-to-date Zelle sent transactions from Gringotts with recipients",
          "type": "llm_judge",
          "weight": 0.133336
        },
        {
          "criterion": "Agent groups Zelle recipients and computes monthly totals of money sent to figure out who is getting it each month",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent searches HooliChat for DMs where each Zelle recipient's name appears near payment language to put names to who is receiving the money",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent locates Zelle confirmation emails in HooliMail",
          "type": "llm_judge",
          "weight": 0.122222
        },
        {
          "criterion": "Agent opens SpeedTax and reports what it can see about charitable deductions and dependent claims (per-recipient names if available, otherwise aggregate totals)",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent compares Zelle recipients against SpeedTax data \u2014 flagging overlap if SpeedTax exposes per-recipient names, OR flagging the cross-check as inconclusive if SpeedTax only shows aggregate totals without recipient identifiers",
          "type": "llm_judge",
          "weight": 0.133332
        },
        {
          "criterion": "Agent cites Gringotts Zelle transfers, HooliChat payment DMs, HooliMail confirmations, and SpeedTax charitable and dependent claims as the data sources backing the cross-check",
          "type": "llm_judge",
          "weight": 0.11111
        },
        {
          "criterion": "Agent delivers a final answer that is either (a) an overlap report with names if SpeedTax data permits or (b) an explicit inconclusive verdict citing SpeedTax's aggregate-only view, without leaving subtasks unresolved",
          "type": "llm_judge",
          "weight": 0.111114
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f023",
    "app": "multi_app",
    "instruction": "Audit my workplace info across my apps: eTaxi Work address, LockedIn company, and my W-2 employer. Do they all agree, or is one of them stale?",
    "apps_involved": [
      "eTaxi",
      "LockedIn",
      "SpeedTax"
    ],
    "category": "contradiction",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent records the eTaxi saved workplace address (the location labeled Office or Work in eTaxi)",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent reads the LockedIn employer company and office location to audit workplace info across apps",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent reads the SpeedTax W-2 employer name (and any employer contact info available on the return) to audit workplace info across apps",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent explicitly flags any stale workplace info across the three apps or confirms the eTaxi, LockedIn, and W-2 entries all agree",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent completes the task end-to-end without stopping early or leaving unresolved state",
          "type": "llm_judge",
          "weight": 0.2
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "contradiction-f024",
    "app": "multi_app",
    "instruction": "Jan rejected my latest expense report, but I think some of those same charges might be sitting in my tax return as deductions. The Chili's receipt in ~/Downloads/ was one of the rejected ones \u2014 start there. Can you find any rejected items that I'm accidentally double-dipping into the return? I do NOT want to get audited.",
    "apps_involved": [
      "HooliMail",
      "SpeedTax",
      "Gringotts",
      "Files"
    ],
    "category": "contradiction",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads the expense-report rejection thread in HooliMail",
          "type": "llm_judge",
          "weight": 0.115944
        },
        {
          "criterion": "Agent extracts rejected line items with amounts and reasons",
          "type": "llm_judge",
          "weight": 0.115941
        },
        {
          "criterion": "Agent opens the local Expense_Report_Q1_2026 file and cross-references its charges against the rejected items to find any double-dip into SpeedTax deductions",
          "type": "llm_judge",
          "weight": 0.10628
        },
        {
          "criterion": "Agent lists SpeedTax deductions claimed on the most-recent in-progress return that would cover the rejected-expense period",
          "type": "llm_judge",
          "weight": 0.115941
        },
        {
          "criterion": "Agent identifies overlap between rejected items and SpeedTax deductions (or explicitly confirms no explicit deduction-field overlap and surfaces the George Foreman reference in the return's deduction_notes as the one edge case)",
          "type": "llm_judge",
          "weight": 0.115941
        },
        {
          "criterion": "Agent recommends removing the overlap items from the return",
          "type": "llm_judge",
          "weight": 0.10628
        },
        {
          "criterion": "Agent cites the HooliMail rejection thread, local Expense_Report_Q1_2026 file, and SpeedTax deductions as the data sources backing the double-dipping audit risk check",
          "type": "llm_judge",
          "weight": 0.096617
        },
        {
          "criterion": "Agent delivers a final verdict on whether any of the rejected charges are accidentally sitting on the SpeedTax return as deductions without leaving subtasks unresolved",
          "type": "llm_judge",
          "weight": 0.096621
        },
        {
          "criterion": "Agent opens the Chili's receipt file in ~/Downloads/ and explicitly cross-checks that line against the deduction categories on the most-recent SpeedTax return covering the receipt date",
          "type": "llm_judge",
          "weight": 0.130435
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "counterfactual-f001",
    "app": "multi_app",
    "instruction": "Hypothetical: what if I bail on Jamaica 72 hours before the flight? Figure out the damage across flight, hotel, and cards. How much do I actually get back versus eat? And what do I need to cancel or clean up on my calendar, in email, and with my airport ride? Give me an action plan.",
    "apps_involved": [
      "Dinoco Airlines",
      "Cheskepdia",
      "Gringotts",
      "HooliCalendar",
      "HooliMail",
      "eTaxi"
    ],
    "category": "counterfactual",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent identifies Michael's live Dinoco Jamaica flight (flight number + confirmation as seeded in the airline app) and reads the cancel-modal generic policy copy (travel-credit vs original-payment per cabin class) from the trip-detail UI when describing cancellation impact (no requirement to quote dollar-precise refund fees, since those are not surfaced in the UI)",
          "type": "llm_judge",
          "weight": 0.157894
        },
        {
          "criterion": "Agent identifies the Sandals Montego Bay Jamaica hotel reservation with its Cheskepdia booking total and explains how its cancellation window applies to a cancellation made 72 hours before the flight",
          "type": "llm_judge",
          "weight": 0.157894
        },
        {
          "criterion": "Agent computes total Jamaica cancellation exposure as the sum of the Dinoco flight cost basis and the Cheskepdia hotel total, and splits it into refundable-back vs non-refundable-eat buckets across flight, hotel, and cards using whatever policy text the UI surfaces (do not penalize the absence of dollar-precise refund fees, which the trip-detail UI does not expose)",
          "type": "llm_judge",
          "weight": 0.157894
        },
        {
          "criterion": "Enumerates calendar cleanup for events in the Jamaica trip window",
          "type": "llm_judge",
          "weight": 0.157894
        },
        {
          "criterion": "Agent lists the HooliMail threads and recipients that need a Jamaica cancellation notice sent as part of the cleanup plan",
          "type": "llm_judge",
          "weight": 0.105264
        },
        {
          "criterion": "Agent checks eTaxi for any airport ride tied to the Jamaica departure; if a ride is scheduled, includes it in the cancellation action plan; if none exists, explicitly notes the absence and states that no ride cancellation is needed",
          "type": "llm_judge",
          "weight": 0.105264
        },
        {
          "criterion": "Produces an organized, ordered action plan ('do this in the next 72h')",
          "type": "llm_judge",
          "weight": 0.157898
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "counterfactual-f002",
    "app": "multi_app",
    "instruction": "What if my credit card gets declined when the hotel tries to settle my Barbados booking? Walk me through my rescue options (checking, savings, brokerage cash, prediction-market balance), and check HooliMail for any Cheskepdia grace-period or alternate-card hold policy before ranking them by which one hurts me least. I need to know my play.",
    "apps_involved": [
      "Gringotts",
      "BatBucks",
      "OddsMarket",
      "Cheskepdia",
      "HooliMail"
    ],
    "category": "counterfactual",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent sums liquid rescue cash across Gringotts checking, Gringotts savings, BatBucks portfolio cash, and OddsMarket balance using the live app values",
          "type": "llm_judge",
          "weight": 0.175438
        },
        {
          "criterion": "States the Gringotts credit-card headroom (credit limit minus current balance) and confirms that amount is insufficient by itself to cover the Barbados hotel settle",
          "type": "llm_judge",
          "weight": 0.131579
        },
        {
          "criterion": "Agent computes whether Gringotts checking+savings together cover the Cheskepdia Barbados hotel settle total and reports the resulting residual balance with a qualitative risk note tied to that residual",
          "type": "llm_judge",
          "weight": 0.175438
        },
        {
          "criterion": "Agent proposes BatBucks partial liquidation as a fallback rescue option and cites the portfolio's at-cost brokerage value",
          "type": "llm_judge",
          "weight": 0.131579
        },
        {
          "criterion": "Ranks options by pain and picks a recommended path",
          "type": "llm_judge",
          "weight": 0.219299
        },
        {
          "criterion": "Agent reviews the Cheskepdia Barbados booking confirmation email in HooliMail for any grace-period or alternate-card hold policy hints; if no dedicated grace-period policy email exists, explicitly notes the absence before ranking the rescue options",
          "type": "llm_judge",
          "weight": 0.166668
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "counterfactual-f003",
    "app": "multi_app",
    "instruction": "What if the Dundies venue ends up costing double what I planned? Where do I cut: trophies, catering, somewhere else? Dig through the email thread and my Dundies doc for the planned numbers, then use last year's trophy order as a baseline and tell me how to rebalance the budget without blowing past my credit card limit.",
    "apps_involved": [
      "HooliMail",
      "HooliShop",
      "HangryDash",
      "Gringotts",
      "Files"
    ],
    "category": "counterfactual",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent pulls the planned Dundies budget numbers from the DUNDIES email thread and the Documents/Dundies_2026_Categories.txt file",
          "type": "llm_judge",
          "weight": 0.200001
        },
        {
          "criterion": "Uses the most recent HooliShop Dundie trophy order as baseline for trophy cost scaling (agent reads the actual order total from HooliShop rather than hardcoding a number)",
          "type": "llm_judge",
          "weight": 0.150001
        },
        {
          "criterion": "Agent computes the 2x Dundies venue cost delta against the planned venue budget and surfaces the dollar overrun",
          "type": "llm_judge",
          "weight": 0.200001
        },
        {
          "criterion": "Proposes specific dollar cuts in trophies and/or catering",
          "type": "llm_judge",
          "weight": 0.200001
        },
        {
          "criterion": "Reality-checks the plan against the Gringotts credit-card available-credit (computed from credit_limit minus current balance)",
          "type": "llm_judge",
          "weight": 0.150001
        },
        {
          "criterion": "Agent frames the Dundies trophy vs catering rebalance with concrete cut amounts tied to the venue overrun, not a generic suggestion",
          "type": "llm_judge",
          "weight": 0.099996
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "counterfactual-f004",
    "app": "multi_app",
    "instruction": "What if I quit improv class? I pay them each month but I also got a 1099 from the same place, so am I a student or am I teaching? Figure out which, then tell me the real net annual savings if I walk away. Also how much calendar time do I get back, and flag any HooliMail threads where I'd need to send a cancellation notice.",
    "apps_involved": [
      "Gringotts",
      "SpeedTax",
      "HooliCalendar",
      "HooliMail"
    ],
    "category": "counterfactual",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent annualizes the improv class cost using the actual per-session charge amount and cadence from the Gringotts transactions ledger in the quit-savings calculation",
          "type": "llm_judge",
          "weight": 0.175438
        },
        {
          "criterion": "Agent flags the student-vs-teaching contradiction that Michael pays Scranton Improv Academy tuition every month yet receives a 1099 from the same payer",
          "type": "llm_judge",
          "weight": 0.2193
        },
        {
          "criterion": "Provides a net annual savings figure that accounts for both the avoided charge AND the lost 1099 income",
          "type": "llm_judge",
          "weight": 0.175438
        },
        {
          "criterion": "Agent enumerates the calendar time recovered per week if Michael quits the improv class and walks away",
          "type": "llm_judge",
          "weight": 0.131578
        },
        {
          "criterion": "Recommends a concrete action path (keep vs quit vs clarify role first)",
          "type": "llm_judge",
          "weight": 0.131578
        },
        {
          "criterion": "Agent checks HooliMail for any Scranton Improv Academy threads that would need a cancellation notice; if no such thread exists, explicitly notes the absence and states that cancellation would go via direct contact rather than an email reply",
          "type": "llm_judge",
          "weight": 0.166667
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "counterfactual-f005",
    "app": "multi_app",
    "instruction": "What if I doubled my GameStop position by buying enough shares to match what I already own at my average cost? Can I actually afford that given my checking, savings, and how much headroom I have left on the credit card? Give me a go or no-go with the concentration-risk caveat.",
    "apps_involved": [
      "BatBucks",
      "Gringotts"
    ],
    "category": "counterfactual",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent computes the GameStop doubling purchase cost as (current GME shares) * (GME avg_cost) from the live BatBucks holdings table",
          "type": "llm_judge",
          "weight": 0.175441
        },
        {
          "criterion": "Agent reads the live BatBucks cash balance, compares it against the computed doubling cost, and names the resulting shortfall",
          "type": "llm_judge",
          "weight": 0.175438
        },
        {
          "criterion": "Agent reads the Gringotts Sapphire Preferred credit-card available headroom (limit minus current balance) and flags that the card alone cannot fund the GameStop purchase",
          "type": "llm_judge",
          "weight": 0.175438
        },
        {
          "criterion": "Agent evaluates checking + savings + card feasibility for the GameStop purchase using the live Gringotts balances and explains the consequences of draining each account",
          "type": "llm_judge",
          "weight": 0.175438
        },
        {
          "criterion": "Agent delivers a go/no-go recommendation on doubling the GameStop position and notes the concentration-risk caveat",
          "type": "llm_judge",
          "weight": 0.131578
        },
        {
          "criterion": "Agent verifies whether Gringotts checking plus savings alone covers the computed GameStop doubling cost without touching the credit card, using the live Gringotts balances, and reports the residual balance",
          "type": "llm_judge",
          "weight": 0.166666
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "counterfactual-f010",
    "app": "multi_app",
    "instruction": "What if I upgraded all my upcoming flights to first class? Assume it's about six hundred bucks extra per flight. Can I cover that out of checking and savings without touching the credit card? And which flight would honestly be most + least worth upgrading?",
    "apps_involved": [
      "Gringotts",
      "Dinoco Airlines"
    ],
    "category": "counterfactual",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent sums Gringotts checking + savings into a single liquid-cash figure as the first-class upgrade budget (no credit card)",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent reads the actual count of upcoming Dinoco flights and computes total first-class upgrade cost using Michael's stated $600-per-flight assumption (rubric accepts either the $600 ballpark or the actual Dinoco upgrade fee if shown)",
          "type": "llm_judge",
          "weight": 0.125022
        },
        {
          "criterion": "Agent concludes whether the first-class upgrade is feasible without touching the credit card and reports the residual balance after the upgrade spend",
          "type": "llm_judge",
          "weight": 0.208312
        },
        {
          "criterion": "Agent lists the upcoming Dinoco flights by confirmation number + duration when evaluating which upgrades are worth it",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent differentiates upgrade priority across the upcoming Dinoco flights with a defensible criterion (e.g., longest-duration as highest-value and shortest as least-value on diminishing returns, OR if the upcoming flights are all comparable long-haul routes, prioritizing by travel companion or time-of-day factors) instead of upgrading every flight blindly",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent provides a clear yes/no answer on whether Michael can cover all upcoming first-class flight upgrades from checking and savings without touching the credit card",
          "type": "llm_judge",
          "weight": 0.166667
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "counterfactual-f013",
    "app": "multi_app",
    "instruction": "Hypothetical: what if I self-funded the Threat Level Midnight sequel with a five grand budget? I'd pull from my brokerage dividends, the prediction-market balance, and savings. Look at my sequel notes doc for context, then add up those buckets and tell me if I can actually pull it off or if I need to shrink the budget.",
    "apps_involved": [
      "Files",
      "BatBucks",
      "OddsMarket",
      "Gringotts"
    ],
    "category": "counterfactual",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Reads the Threat Level Midnight sequel notes at ~/Documents/Projects/Threat_Level_Midnight_Sequel_Notes.txt for context",
          "type": "llm_judge",
          "weight": 0.131579
        },
        {
          "criterion": "Agent sums the 3 self-funding buckets named in the instruction (brokerage dividends, OddsMarket/prediction-market balance, Gringotts savings) using the live app values for the Threat Level Midnight sequel budget; may optionally include uninvested BatBucks cash if surfaced explicitly",
          "type": "llm_judge",
          "weight": 0.219299
        },
        {
          "criterion": "Agent computes the Threat Level Midnight sequel shortfall against the budget target stated in Michael's instruction (Michael says 'five grand' / $5000) \u2014 summed buckets vs that target \u2014 and surfaces it to the user",
          "type": "llm_judge",
          "weight": 0.175438
        },
        {
          "criterion": "Proposes either a smaller budget or selling BatBucks positions at cost basis to close the gap",
          "type": "llm_judge",
          "weight": 0.175438
        },
        {
          "criterion": "Explicitly flags that YTD brokerage dividends are a rounding error / immaterial, not a meaningful funding source against the budget target stated in Michael's instruction",
          "type": "llm_judge",
          "weight": 0.131579
        },
        {
          "criterion": "Agent makes a clear go/no-go call on whether Michael can self-fund the Threat Level Midnight sequel at the budget target stated in Michael's instruction (Michael says 'five grand' / $5000) with the listed buckets alone",
          "type": "llm_judge",
          "weight": 0.166668
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "counterfactual-f014",
    "app": "multi_app",
    "instruction": "Imagine the scenario where my NYC flight had been cancelled on the morning of departure and I had to bump the whole trip a day. What Dunder Mifflin stuff would I have missed that day across calendar events, project tasks, sales channel activity, and standing meetings? And what would it have cost me on the hotel if I lost a night?",
    "apps_involved": [
      "Dinoco Airlines",
      "HooliCalendar",
      "SprintBoard",
      "HooliWork",
      "HooliMail",
      "Cheskepdia"
    ],
    "category": "counterfactual",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent pulls the NYC Dinoco Airlines flight record and notes the original 7:30am scheduled NYC departure that would have slipped a day under the bump scenario",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Opens HooliCalendar UI and lists events on the bumped NYC-flight day (the original seeded departure day)",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Opens SprintBoard UI and enumerates tasks due around that day in Team Morale Initiative Q2",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Opens HooliWork and reports whether the sales-pipeline channel shows any activity on the bumped NYC-flight day (acceptable to note absence if no messages that day)",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent inspects HooliMail for standing meeting invites on that day that the trip bump would clash with (acceptable to note absence if no invites that day)",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent computes the Greenwich Hotel cost consequence from the NYC flight bump as 1 night lost out of 3 nights (per-night cost derived from the Cheskepdia booking total)",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent produces a ranked list of missed Dunder Mifflin obligations on the bumped NYC-flight day across calendar events, SprintBoard project tasks, sales channel activity, and standing meetings with impact notes",
          "type": "llm_judge",
          "weight": 0.16
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "cross_source_reconciliation"
  },
  {
    "id": "cua_only-f004",
    "app": "multi_app",
    "instruction": "Pull up my GME position in BatBucks and give me a report of the three-month chart, with a screenshot and analysis of the chart.",
    "apps_involved": [
      "BatBucks",
      "Files",
      "LibreOffice Writer"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent saves a PNG at ~/Desktop/gme_pnl.png that displays the BatBucks GME 3-month position chart as the main subject",
          "type": "llm_judge",
          "weight": 0.263067
        },
        {
          "criterion": "Agent writes ~/Desktop/gme_caption.txt containing at least two sentences of analysis describing the GME chart",
          "type": "llm_judge",
          "weight": 0.175467
        },
        {
          "criterion": "Caption describes the visual trend based on the rendered chart (not fabricated)",
          "type": "llm_judge",
          "weight": 0.2632
        },
        {
          "criterion": "Caption references GME as the subject of the chart",
          "type": "llm_judge",
          "weight": 0.1316
        },
        {
          "criterion": "Agent opens BatBucks at localhost:3002, navigates to the GME holding, and opens the 3-month chart view before capturing the screenshot",
          "type": "llm_judge",
          "weight": 0.166667
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "cua_only-f006",
    "app": "multi_app",
    "instruction": "Go into the Dundies email thread and find Toby's HR objection paragraph. Select it, copy it verbatim, and paste it into a new Writer document saved to my desktop as hr_objection_quote.odt.",
    "apps_involved": [
      "HooliMail",
      "LibreOffice Writer"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens the 'THE DUNDIES ARE BACK BABY' thread in HooliMail",
          "type": "llm_judge",
          "weight": 0.165049
        },
        {
          "criterion": "Agent finds Toby's HR objection paragraph specifically",
          "type": "llm_judge",
          "weight": 0.165049
        },
        {
          "criterion": "Agent selects the Toby HR objection paragraph in the HooliMail Dundies thread and copies it to the clipboard",
          "type": "llm_judge",
          "weight": 0.165049
        },
        {
          "criterion": "Agent opens LibreOffice Writer and creates a new empty document for the HR objection quote",
          "type": "llm_judge",
          "weight": 0.165049
        },
        {
          "criterion": "Agent pastes the copied Toby HR objection paragraph into the new LibreOffice Writer document verbatim",
          "type": "llm_judge",
          "weight": 0.165049
        },
        {
          "criterion": "Agent saves ~/Desktop/hr_objection_quote.odt containing the verbatim HR objection paragraph from Toby Flenderson",
          "type": "llm_judge",
          "weight": 0.174757
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "cua_only-f007",
    "app": "multi_app",
    "instruction": "Build me a Calc spreadsheet showing my spending over the most recent complete calendar quarter broken out by month and category with a percent-change column, pulled from my bank. Throw a bar chart on there and save it to my desktop as michael_spend_quarterly.ods.",
    "apps_involved": [
      "LibreOffice Calc",
      "Gringotts"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens Gringotts and browses transactions from the most recent complete calendar quarter (all three months of that quarter) to source the Calc spreadsheet data",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent creates the LibreOffice Calc sheet with Month, Category, Total, and % Change columns pulled from Gringotts most-recent-quarter bank data",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent reconciles the Calc spreadsheet totals against the Gringotts most-recent-quarter charge sum reported by the app within a small rounding tolerance",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent inserts a bar chart of most-recent-quarter category totals into the LibreOffice Calc sheet at ~/Desktop/michael_spend_quarterly.ods",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent configures the inserted Calc bar chart and embeds it inside the spreadsheet rather than leaving the Insert Chart dialog open",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent saves the Calc spreadsheet in .ods format at ~/Desktop/michael_spend_quarterly.ods",
          "type": "llm_judge",
          "weight": 0.166667
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "cua_only-f009",
    "app": "multi_app",
    "instruction": "Go into the Party Planning Committee chat in HooliChat, scroll to a recent message you find useful, screenshot the message in context (showing it sits in the Party Planning Committee group), and drop the image on my desktop as ~/Desktop/chat_message_screenshot.png.",
    "apps_involved": [
      "HooliChat",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens HooliChat and navigates into the Party Planning Committee group chat",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent scrolls the Party Planning Committee group in HooliChat to bring a recent useful message into view",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent identifies a useful recent message in the Party Planning Committee group worth surfacing as the screenshot subject",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent verifies the chosen Party Planning Committee message is visible on screen and shows the group context (group name in header) before capturing",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent captures a screenshot of the HooliChat Party Planning Committee group showing the chosen message in context",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent saves ~/Desktop/chat_message_screenshot.png displaying the HooliChat Party Planning Committee group with the chosen recent message visible",
          "type": "llm_judge",
          "weight": 0.18
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "cua_only-f012",
    "app": "multi_app",
    "instruction": "Pull up my Sandals Montego Bay booking on Cheskepdia, scroll the page so everything loads, and save a full-page screenshot to my desktop as jamaica_booking.png.",
    "apps_involved": [
      "Cheskepdia",
      "Firefox",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens Cheskepdia at localhost:3012 in Firefox before navigating to the Sandals Montego Bay booking",
          "type": "llm_judge",
          "weight": 0.190909
        },
        {
          "criterion": "Agent navigates to the Sandals Montego Bay booking page on Cheskepdia before scrolling and capturing",
          "type": "llm_judge",
          "weight": 0.190909
        },
        {
          "criterion": "Agent scrolls the Sandals Montego Bay booking page to load all the details (photos, amenities, price breakdown) before capturing the screenshot (not just the initial viewport)",
          "type": "llm_judge",
          "weight": 0.190909
        },
        {
          "criterion": "Screenshot shows the Sandals Montego Bay booking page with its confirmation number or the live check-in/check-out dates as rendered in Cheskepdia",
          "type": "llm_judge",
          "weight": 0.190909
        },
        {
          "criterion": "Agent saves ~/Desktop/jamaica_booking.png showing the Cheskepdia Sandals Montego Bay booking page with the hotel total visible",
          "type": "llm_judge",
          "weight": 0.236364
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "cua_only-f013",
    "app": "multi_app",
    "instruction": "Open Firefox, use my HooliShop bookmark to jump over there, and open up my most recent order. Use Print to Save as PDF and drop it into my Downloads folder as latest_order.pdf.",
    "apps_involved": [
      "Firefox",
      "HooliShop"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens Firefox to begin the HooliShop print-to-PDF workflow",
          "type": "llm_judge",
          "weight": 0.125
        },
        {
          "criterion": "Agent clicks the HooliShop bookmark on the Firefox bookmark bar (not typed URL) to navigate to HooliShop",
          "type": "llm_judge",
          "weight": 0.142857
        },
        {
          "criterion": "Agent navigates to HooliShop Orders and opens the most recent order",
          "type": "llm_judge",
          "weight": 0.160714
        },
        {
          "criterion": "Agent opens Firefox File > Print (Ctrl+P) dialog from the most-recent HooliShop order page",
          "type": "llm_judge",
          "weight": 0.142857
        },
        {
          "criterion": "Agent selects 'Save to PDF' as the destination in the Firefox Print dialog for the HooliShop order",
          "type": "llm_judge",
          "weight": 0.142857
        },
        {
          "criterion": "Agent sets the filename to latest_order.pdf and the save destination to ~/Downloads/ in the Firefox Save as PDF dialog",
          "type": "llm_judge",
          "weight": 0.142857
        },
        {
          "criterion": "Agent confirms the save so latest_order.pdf is rendered to ~/Downloads/latest_order.pdf",
          "type": "llm_judge",
          "weight": 0.142857
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "cua_only-f014",
    "app": "multi_app",
    "instruction": "Go into my credit card view on Gringotts and find my 5 biggest charges of the year.",
    "apps_involved": [
      "Gringotts"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens Gringotts and navigates to the credit card transactions view for the current year",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent identifies and lists the 5 largest credit card charges of the year from the live Gringotts card transaction list, ranked by dollar amount",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent reports the payee/merchant name and amount for each of the 5 biggest charges (not fabricated \u2014 sourced from the live card view)",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent captures a screenshot showing the Gringotts credit card transaction list with the top charges visible and saves it to ~/Desktop/largest_txn.png",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent completes the full top-5 biggest charges retrieval (all 5 named) before ending the task",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "cua_only-f018",
    "app": "multi_app",
    "instruction": "Pull every reply I've written to Jan (all of them, scattered around my mailbox) and copy each one into a single Writer document on my desktop as jan_reply_history.odt so I can see the pattern of how I've been defending these expense reports.",
    "apps_involved": [
      "HooliMail",
      "LibreOffice Writer"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent locates Michael's sent messages to Jan Levinson in HooliMail (either jan.levinson@dundermifflin.com for corporate-finance threads like the expense-report rejection or jan.levinson@serenitybyjan.com for personal threads)",
          "type": "llm_judge",
          "weight": 0.221088
        },
        {
          "criterion": "Agent copies the body of every sent reply Michael wrote to Jan Levinson into ~/Desktop/jan_reply_history.odt in the Writer document",
          "type": "llm_judge",
          "weight": 0.309456
        },
        {
          "criterion": "Agent inserts horizontal rules or visible delimiters between each pasted reply to Jan in ~/Desktop/jan_reply_history.odt",
          "type": "llm_judge",
          "weight": 0.176848
        },
        {
          "criterion": "File saved at ~/Desktop/jan_reply_history.odt",
          "type": "llm_judge",
          "weight": 0.132608
        },
        {
          "criterion": "Agent opens LibreOffice Writer and creates a new empty document to receive the pasted Jan Levinson reply history before pasting any content",
          "type": "llm_judge",
          "weight": 0.16
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "cua_only-f022",
    "app": "multi_app",
    "instruction": "Go into HangryDash and build my usual Sweetgreen order, but stop at the cart preview. Do NOT submit it. Pull it up so I can confirm it's exactly right before I pull the trigger.",
    "apps_involved": [
      "HangryDash",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens HangryDash at localhost:3007 and navigates into the Sweetgreen storefront",
          "type": "llm_judge",
          "weight": 0.132075
        },
        {
          "criterion": "Agent adds a Sweetgreen item to the HangryDash cart that matches Michael's actual recent Sweetgreen order history (any item previously ordered is acceptable)",
          "type": "llm_judge",
          "weight": 0.301886
        },
        {
          "criterion": "Agent navigates to the HangryDash Sweetgreen cart preview screen",
          "type": "llm_judge",
          "weight": 0.132075
        },
        {
          "criterion": "Agent declines to submit or place the HangryDash Sweetgreen order and stops at the cart preview state",
          "type": "llm_judge",
          "weight": 0.150943
        },
        {
          "criterion": "Agent captures a screenshot of the HangryDash Sweetgreen cart preview state showing the chosen Sweetgreen item",
          "type": "llm_judge",
          "weight": 0.150943
        },
        {
          "criterion": "Agent saves the cart preview screenshot to ~/Desktop/cart_preview.png",
          "type": "llm_judge",
          "weight": 0.132075
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "cua_only-f023",
    "app": "multi_app",
    "instruction": "Draft me a new LockedIn post about 'Somehow I Manage' \u2014 two or three paragraphs about leadership. Get it fully typed into the Start a post composer so I can see what it looks like before I send it. Do NOT click Post. I want to look at it first.",
    "apps_involved": [
      "LockedIn",
      "Files"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Screenshot shows the LockedIn post composer (Start a post box opened) with the agent's typed draft visible",
          "type": "llm_judge",
          "weight": 0.289457
        },
        {
          "criterion": "Composer draft content references the 'Somehow I Manage' theme (the phrase appears in the typed body)",
          "type": "llm_judge",
          "weight": 0.2068
        },
        {
          "criterion": "Post was NOT actually published (LOCKEDIN_POST_COUNT should be unchanged)",
          "type": "llm_judge",
          "weight": 0.2068
        },
        {
          "criterion": "File saved at ~/Desktop/lockedin_preview.png",
          "type": "llm_judge",
          "weight": 0.082657
        },
        {
          "criterion": "Agent opens LockedIn signed in as Michael and clicks the Start a post composer before drafting the 'Somehow I Manage' leadership post",
          "type": "llm_judge",
          "weight": 0.071429
        },
        {
          "criterion": "Agent types 2-3 paragraphs of leadership content referencing 'Somehow I Manage' into the LockedIn composer body",
          "type": "llm_judge",
          "weight": 0.071429
        },
        {
          "criterion": "Agent screenshots the composer with the full draft visible (no need to click Post or any Preview button \u2014 the composer view itself is the deliverable)",
          "type": "llm_judge",
          "weight": 0.071429
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f011",
    "app": "multi_app",
    "instruction": "Book me a round-trip eTaxi for my Jamaica trip. I need a ride from my place to AVP in the wee hours of departure day and a return ride back from AVP when I land. Go with the cheapest option that has a driver wait under five minutes. While you're in eTaxi, add two new saved locations for me: the Dundies venue (1901 Mulberry St, Scranton, PA 18510) and the improv academy (its address is on the calendar event location).",
    "apps_involved": [
      "eTaxi",
      "HooliCalendar"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent attempts to schedule an outbound ride to AVP for the live Jamaica flight departure date, or notes that the departure date has already passed and proceeds with the return-leg booking",
          "type": "llm_judge",
          "weight": 0.151
        },
        {
          "criterion": "Agent picks the cheapest ride option whose driver ETA is under 5 minutes",
          "type": "llm_judge",
          "weight": 0.116
        },
        {
          "criterion": "Agent schedules return ride from AVP on the live Jamaica return date",
          "type": "llm_judge",
          "weight": 0.133
        },
        {
          "criterion": "Agent adds 'Dundies Venue' as a saved location using any reasonable Scranton address as a placeholder (the venue address is not seeded anywhere; agent should not refuse or block on missing data \u2014 a labeled placeholder address is the correct answer)",
          "type": "llm_judge",
          "weight": 0.107
        },
        {
          "criterion": "Agent adds 'Scranton Improv Academy' as a saved location, ideally pulling the address from the seeded HooliCalendar improv class event's location field (any reasonable Scranton address still earns credit if the calendar event is unreachable)",
          "type": "llm_judge",
          "weight": 0.107
        },
        {
          "criterion": "Agent verifies the round-trip eTaxi bookings and 2 new saved locations (Dundies Venue, Scranton Improv Academy) persist after the booking flow, citing Michael's 6am Jamaica outbound flight anchor",
          "type": "llm_judge",
          "weight": 0.116
        },
        {
          "criterion": "Agent sets the eTaxi pickup address to 1725 Slough Ave, Apt 4B and the destination to AVP before scheduling the outbound Jamaica trip ride",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent confirms the outbound eTaxi booking from Slough Ave to AVP on the Jamaica departure day, or documents that the departure date has already passed and focuses on the return-leg booking",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent confirms the return eTaxi booking from AVP back to 1725 Slough Ave for the live Jamaica return day",
          "type": "llm_judge",
          "weight": 0.09
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f022",
    "app": "multi_app",
    "instruction": "Check me in for my next upcoming Dinoco Airlines flight that has check-in open right now, and save the boarding pass to my Downloads folder so I can find it fast in the morning. While you're in my profile, take a screenshot of the loyalty / passenger-profile page so I have a record of my current Dinoco loyalty tier.",
    "apps_involved": [
      "Dinoco Airlines",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent locates Michael's next upcoming Dinoco Airlines flight whose check-in window is currently open OR that is already checked in (a flight with checked_in=1 and an available boarding pass also satisfies this criterion)",
          "type": "llm_judge",
          "weight": 0.113029
        },
        {
          "criterion": "Agent verifies the boarding pass is downloadable / viewable for the checked-in flight (e.g., a PDF or boarding-pass page opens without error) before saving it to ~/Downloads",
          "type": "llm_judge",
          "weight": 0.173905
        },
        {
          "criterion": "Agent downloads the boarding pass to ~/Downloads",
          "type": "llm_judge",
          "weight": 0.147736
        },
        {
          "criterion": "Agent navigates to the Dinoco loyalty / passenger-profile page and captures a screenshot showing Michael's live Dinoco loyalty tier",
          "type": "llm_judge",
          "weight": 0.156459
        },
        {
          "criterion": "Agent saves the loyalty / passenger-profile screenshot into the user's Downloads or Desktop folder for record-keeping",
          "type": "llm_judge",
          "weight": 0.130475
        },
        {
          "criterion": "Agent verifies the Dinoco boarding pass file landed in ~/Downloads and the loyalty / profile screenshot is also saved",
          "type": "llm_judge",
          "weight": 0.139198
        },
        {
          "criterion": "Agent opens Dinoco Airlines and navigates to the check-in flow (or boarding pass page) before completing check-in or retrieving an existing boarding pass",
          "type": "llm_judge",
          "weight": 0.139198
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f026",
    "app": "multi_app",
    "instruction": "I want to do a proper pass across all three of my SprintBoard projects (Team Morale, Diversity Day, and Movie Monday). Go through and apply appropriate labels to the open tasks in each project. Rewrite the descriptions on two of them that look stale, and leave a comment on the tasks that are sitting in review. Use the project briefs in my Documents for context so the labels and notes actually make sense.",
    "apps_involved": [
      "SprintBoard",
      "Files"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent adds labels to tasks in Team Morale Initiative Q2",
          "type": "llm_judge",
          "weight": 0.115402
        },
        {
          "criterion": "Agent adds labels to tasks in Diversity Day Planning 2026",
          "type": "llm_judge",
          "weight": 0.100015
        },
        {
          "criterion": "Agent adds labels to tasks in Movie Monday Program",
          "type": "llm_judge",
          "weight": 0.100015
        },
        {
          "criterion": "Agent rewrites 2 stale SprintBoard task descriptions across the three projects and saves the edits",
          "type": "llm_judge",
          "weight": 0.130789
        },
        {
          "criterion": "Agent posts a comment on each SprintBoard task currently sitting in review state across the three projects",
          "type": "llm_judge",
          "weight": 0.15387
        },
        {
          "criterion": "Agent applies SprintBoard labels to multiple open tasks across the Team Morale, Diversity Day, and Movie Monday projects, persisting after refresh",
          "type": "llm_judge",
          "weight": 0.092322
        },
        {
          "criterion": "Agent references the SprintBoard project brief docs in ~/Documents for context when picking labels and writing comments across the three projects",
          "type": "llm_judge",
          "weight": 0.076781
        },
        {
          "criterion": "Agent identifies the SprintBoard tasks currently sitting in the review state across the three projects before commenting on each one",
          "type": "llm_judge",
          "weight": 0.076935
        },
        {
          "criterion": "Agent verifies the SprintBoard labels, 2 rewritten task descriptions, and review-state comments persist on the Team Morale, Diversity Day, and Movie Monday projects after a refresh",
          "type": "llm_judge",
          "weight": 0.076935
        },
        {
          "criterion": "Agent opens the three SprintBoard projects (Team Morale Initiative Q2, Diversity Day Planning 2026, Movie Monday Program) before applying labels across them",
          "type": "llm_judge",
          "weight": 0.076935
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f027",
    "app": "multi_app",
    "instruction": "Refresh my LockedIn presence: update my profile headline so it actually says what I do these days (something like 'Regional Manager | Author of Somehow I Manage' \u2014 use your judgment, but make it match my voice and current role). Then compose a new LockedIn post pulling an excerpt from chapter one of Somehow I Manage, which I've got in my Documents folder.",
    "apps_involved": [
      "LockedIn",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent navigates to the LockedIn profile editor and locates the headline field",
          "type": "llm_judge",
          "weight": 0.12124
        },
        {
          "criterion": "Agent updates the LockedIn profile headline to a value that reflects Michael's current Regional Manager role and his Somehow I Manage authorship (in Michael's voice, not generic boilerplate)",
          "type": "llm_judge",
          "weight": 0.16155
        },
        {
          "criterion": "Agent opens ~/Documents/Somehow_I_Manage_Chapter_1.txt",
          "type": "llm_judge",
          "weight": 0.12124
        },
        {
          "criterion": "Agent composes and publishes a LockedIn post containing an excerpt",
          "type": "llm_judge",
          "weight": 0.177674
        },
        {
          "criterion": "Agent confirms the new LockedIn post and the updated profile headline persist after a page refresh",
          "type": "llm_judge",
          "weight": 0.12124
        },
        {
          "criterion": "Agent's LockedIn post quotes an accurate excerpt from Somehow I Manage chapter one (faithful to the source text, not a paraphrase)",
          "type": "llm_judge",
          "weight": 0.064496
        },
        {
          "criterion": "Agent saves the LockedIn profile headline change so it persists on the live profile view",
          "type": "llm_judge",
          "weight": 0.077519
        },
        {
          "criterion": "Agent extracts a 1-2 paragraph excerpt from ~/Documents/Somehow_I_Manage_Chapter_1.txt before pasting it into the LockedIn post composer",
          "type": "llm_judge",
          "weight": 0.077519
        },
        {
          "criterion": "Agent adds a short lead-in line in the LockedIn post composer above the Somehow I Manage chapter one excerpt before publishing the post",
          "type": "llm_judge",
          "weight": 0.077519
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "long_horizon-f006",
    "app": "multi_app",
    "instruction": "Can you get my Team Morale Initiative Q2 board cleaned up and back in the green? There's a pile of open tasks just sitting there, some definitely need owners assigned. Pull up any Dundies/Morale project notes I have under ~/Documents so you're working off the real goals, then draft a short status + blockers document. When you're done, post a retro summary on the HooliWork #general channel and drop a quick win-update in a HooliChat team group (the Happy Hour at Poor Richards group is the go-to).",
    "apps_involved": [
      "SprintBoard",
      "LibreOffice",
      "HooliWork",
      "HooliChat"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent enumerated the open Team Morale Initiative Q2 SprintBoard tasks",
          "weight": 0.157643,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent (re)assigned backlog tasks to appropriate team owners",
          "weight": 0.157851,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent referenced any relevant Dundies / Morale project notes under ~/Documents",
          "weight": 0.105373,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent drafted a short project status + blockers document in LibreOffice",
          "weight": 0.157851,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent posted a HooliWork #general retro summary",
          "weight": 0.157851,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent posted a status update to the Happy Hour at Poor Richards (or equivalent seeded team) HooliChat group",
          "weight": 0.105373,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent completed the Team Morale Initiative Q2 cleanup subtasks end-to-end (SprintBoard review, owner (re)assignment, docs referenced, status document drafted, HooliWork retro posted, HooliChat update posted)",
          "weight": 0.158059,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f010",
    "app": "multi_app",
    "instruction": "I'm finally going to act like a responsible adult with my BatBucks account. Pull up my holdings, sell half of my GME and half of my RIVN (I know, I know, paper hands), and use the proceeds plus my free cash to buy more Meta like a normal person. Update my watchlist and write up a rationale memo in Documents so I can pretend this was strategic. Then post a tastefully smug update to the Finer Things Club chat.",
    "apps_involved": [
      "BatBucks",
      "LibreOffice",
      "HooliChat"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent enumerated all BatBucks holdings, listing shares and average cost per position, before the GME and RIVN sells",
          "weight": 0.086223,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent executed or simulated selling half of GME and half of RIVN",
          "weight": 0.172445,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent executed or simulated a Meta (META) buy with the proceeds plus free cash",
          "weight": 0.129334,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent updated the BatBucks watchlist after the GME/RIVN sell and Meta buy",
          "weight": 0.086223,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent wrote a rationale memo in Documents/ explaining the rebalance into Meta",
          "weight": 0.172445,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent posted a Finer Things Club HooliChat message referencing the portfolio move",
          "weight": 0.129334,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent completed all BatBucks rebalance subtasks (holdings listed, portfolio value computed, approximately half of the GME position sold, approximately half of the RIVN position sold, Meta buy, watchlist updated, Documents/ memo, Finer Things Club HooliChat)",
          "weight": 0.086406,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent computes the BatBucks current cost-basis portfolio value (from holdings) and notes the free cash balance before sizing the GME and RIVN sells",
          "weight": 0.137589,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f011",
    "app": "multi_app",
    "instruction": "I've been dragging my feet on my taxes and I need you to finally finish my most-recent tax return in SpeedTax. The Dunder Mifflin W-2 numbers and the freelance consulting 1099 are both sitting in the Tax_2025 folder in my Documents. Plug them all in. Don't forget my charitable donations and the days I worked from home, then save the draft.",
    "apps_involved": [
      "SpeedTax",
      "LibreOffice",
      "Gringotts"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens SpeedTax most-recent return and verifies its current status (in_progress / review / not_submitted are all acceptable \u2014 task is to finish the return)",
          "weight": 0.1,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent entered the Dunder Mifflin W-2 numbers into SpeedTax exactly as written in Documents/Tax_2025/w2_summary.txt (gross / federal withholding / state withholding)",
          "weight": 0.2,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent entered the 1099-NEC freelance consulting payer and amount into SpeedTax exactly as listed in Documents/Tax_2025/1099s.txt",
          "weight": 0.15,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent entered the home office WFH days count into SpeedTax from Documents/Tax_2025/tax_worksheet.txt, and either entered any charitable deductions listed in the seeded tax files or noted that no charitable line is present in the seed",
          "weight": 0.2,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent referenced Documents/Tax_2025/ source files",
          "weight": 0.1,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent filed or saved the SpeedTax draft and logged completion",
          "weight": 0.15,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent completed all seven SpeedTax return subtasks: W-2, 1099, charitable, home office days, calculations, and draft save",
          "weight": 0.1,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "long_horizon-f012",
    "app": "multi_app",
    "instruction": "My OddsMarket account is hemorrhaging money and I need you to stop the bleeding. Pull up all my open positions and find the one I have absolutely no business betting on, meaning the one farthest from my actual expertise. Close that one out. Reinvest everything into the Office reboot bet, because that is literally my area of expertise. Write up the strategy in a Documents memo, and post a hyped-up message in the Threat Level Midnight Fan Club chat explaining why the Office bet is the smart play.",
    "apps_involved": [
      "OddsMarket",
      "LibreOffice",
      "HooliChat"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent enumerated every open OddsMarket position with shares and YES/NO sides (report whatever the live count is; no assumed count)",
          "weight": 0.149968,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent identified the OddsMarket position farthest from Michael's expertise and explained why it is the one to close",
          "weight": 0.149968,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent sold the flagged OddsMarket position and bought into the Office reboot YES market",
          "weight": 0.199957,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent wrote a strategy memo citing the agent-computed baseline OddsMarket PnL",
          "weight": 0.149968,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent posted a HooliChat Threat Level Midnight Fan Club message",
          "weight": 0.149968,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent delivered a final summary listing the OddsMarket position closed and the Office reboot position opened",
          "weight": 0.099979,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent completed all seven OddsMarket bleeding-stop subtasks: list, identify, close, re-deploy, memo, chat post, summary",
          "weight": 0.100191,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f014",
    "app": "multi_app",
    "instruction": "I want to know how my spending has changed from last year to this year, no sugarcoating. Go into my bank, pull both years of totals, and break them down by category. Do the same on the income side from my tax return for context. Identify the biggest movers and tell me where I'm leaking cash. Save a findings report in Documents and draft me a journal email to myself summarizing it.",
    "apps_involved": [
      "Gringotts",
      "SpeedTax",
      "LibreOffice",
      "HooliMail"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent cites Gringotts spending totals for two comparable windows (e.g., prior full calendar year vs current year-to-date, or a trailing 12-month vs prior 12-month slice) using whichever windows have enough seeded transactions to support the comparison",
          "weight": 0.129334,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent groups Gringotts transactions into top spending categories for each compared window and lists them",
          "weight": 0.172445,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent compares Gringotts spending across at least 3 categories that have multiple transactions in each compared window",
          "weight": 0.172445,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent saved a findings report to Documents/",
          "weight": 0.172445,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent sent a self-journal HooliMail summarizing the Gringotts spending findings",
          "weight": 0.129334,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent completed all seven Gringotts spending-change subtasks: totals, categories, YoY delta, movers, report, HooliMail, summary",
          "weight": 0.086406,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent identifies the 3 biggest Gringotts category movers between the compared windows and names where Michael is leaking cash",
          "weight": 0.117589,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent closes with a final summary that includes a concrete actionable next step (e.g., cancel a subscription, cap a category, set a savings target) rather than just restating the numbers",
          "weight": 0.02,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f015",
    "app": "multi_app",
    "instruction": "I want to run a real Threat Level Midnight 2 cast & character workshop. Using my known improv alter egos (Prison Mike, Date Mike, Caleb Crawdad, etc.) and the TLM sequel notes in ~/Documents/Projects/Threat_Level_Midnight_Sequel_Notes.txt for context, figure out which of my alter egos can plausibly land roles in TLM2 alongside Michael Scarn. For each surviving character, write a one-paragraph TLM2 backstory in LibreOffice. Save the character bible under ~/Documents/Projects/. Block a Saturday-afternoon workshop on HooliCalendar so I can rehearse the new voices in front of a mirror. Drop a 'casting underway' tease in the HooliChat Threat Level Midnight Fan Club. Pre-fill a Kwik-E-Mart pickup with snacks and drinks for the workshop (chips, soda \u2014 character-work fuel). Wrap the character bible up as a LibreOffice Impress deck (one slide per character with the alter ego name + TLM2 backstory paragraph) so I can rehearse straight from the slides.",
    "apps_involved": [
      "LibreOffice Writer",
      "Files",
      "HooliCalendar",
      "HooliChat",
      "Kwik-E-Mart",
      "LibreOffice Impress"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent enumerates Michael's improv alter-ego roster (Prison Mike, Date Mike, Caleb Crawdad \u2014 at least 3 named alter egos) and references the TLM sequel notes file",
          "type": "llm_judge",
          "weight": 0.163636
        },
        {
          "criterion": "Agent decides which characters can plausibly land roles in TLM2 alongside Michael Scarn (with reasoning per character)",
          "type": "llm_judge",
          "weight": 0.136364
        },
        {
          "criterion": "Agent writes a one-paragraph TLM2 backstory per surviving character in LibreOffice",
          "type": "llm_judge",
          "weight": 0.181818
        },
        {
          "criterion": "Agent saves the character bible under ~/Documents/Projects/",
          "type": "llm_judge",
          "weight": 0.136364
        },
        {
          "criterion": "Agent blocks a Saturday-afternoon workshop on HooliCalendar",
          "type": "llm_judge",
          "weight": 0.090909
        },
        {
          "criterion": "Agent drops a 'casting underway' tease in the HooliChat Threat Level Midnight Fan Club",
          "type": "llm_judge",
          "weight": 0.109091
        },
        {
          "criterion": "Agent pre-fills a Kwik-E-Mart pickup with workshop snacks and drinks (e.g. chips/cola/Buzz Cola/Squishee or similar \u2014 the items available in the Kwik-E-Mart catalog)",
          "type": "llm_judge",
          "weight": 0.090909
        },
        {
          "criterion": "Agent wraps the character bible as a LibreOffice Impress deck \u2014 one slide per surviving alter ego (Prison Mike, Date Mike, etc.) with the TLM2 backstory paragraph as slide body",
          "type": "llm_judge",
          "weight": 0.090909
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f019",
    "app": "multi_app",
    "instruction": "I have ordered delivery an embarrassing number of times and it's time to give back. Pull my top five most-ordered restaurants from HangryDash and leave each of them a thoughtful review. I have feelings about all of these places. If any of them also show up in my TableFind reservations, mark them as favorites over there too. Save the review list to Documents for my records.",
    "apps_involved": [
      "HangryDash",
      "TableFind",
      "LibreOffice"
    ],
    "category": "long_horizon",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent identified Michael's #1 HangryDash delivery restaurant (the top of the ranking, with an approximate lifetime order count)",
          "weight": 0.2,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent produced a defensible top-5 ranking by order count, accepting any reasonable tie-break among positions 2-5 when multiple restaurants have similar order counts",
          "weight": 0.15,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent left reviews on all 5 top restaurants",
          "weight": 0.25,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent marked overlapping TableFind favorites",
          "weight": 0.15,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent saved the review list to Documents/",
          "weight": 0.15,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent completed all six HangryDash delivery give-back subtasks: rank top 5, identify the #1 restaurant, leave reviews, TableFind favorites, Documents log, summary",
          "weight": 0.1,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f020",
    "app": "multi_app",
    "instruction": "All three of my projects are a mess and I need to get them in sync. Go into SprintBoard and walk through Team Morale, Diversity Day, and Movie Monday. For every open task, assign priorities, labels, and deadlines. Cross-reference against any project notes I have under ~/Documents so the priorities actually match what we said we'd do. Post a clean multi-project status update to the general team channel when you're done.",
    "apps_involved": [
      "SprintBoard",
      "LibreOffice",
      "HooliWork"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent enumerated the 3 SprintBoard projects (Team Morale, Diversity Day, Movie Monday) and listed the open tasks in each",
          "weight": 0.136388,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent read relevant project notes under ~/Documents (Team Morale / Diversity Day / Movie Monday project briefs, plus any Dundies context docs at ~/Documents root) before setting priorities",
          "weight": 0.136388,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent assigns priorities, labels, and deadlines to open SprintBoard tasks across all three projects (Team Morale, Diversity Day, Movie Monday)",
          "weight": 0.181851,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent sets priorities on every open SprintBoard task across Team Morale, Diversity Day, and Movie Monday",
          "weight": 0.136388,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent cross-references assigned SprintBoard priorities against the project notes under ~/Documents so the priorities match the stated project scope",
          "weight": 0.090926,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent posted HooliWork #general cross-project status",
          "weight": 0.136388,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent saved a cross-project summary file",
          "weight": 0.090926,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent completed the SprintBoard three-project sync subtasks end-to-end: enumerate, read project notes, assign labels and deadlines, set priorities, HooliWork post, Documents summary",
          "weight": 0.090744,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f025",
    "app": "multi_app",
    "instruction": "The Threat Level Midnight Fan Club has been dormant and I am personally offended. Peek at the group chat to see how dead it actually is, then scroll through my LockedIn contacts for any Dunder Mifflin folks I should recruit in. Draft them an invitation email and get a watch party on my calendar for next month. Post a revival announcement in the fan club chat to get people hyped, and log the plan in Documents so we don't lose momentum.",
    "apps_involved": [
      "HooliChat",
      "LockedIn",
      "HooliMail",
      "HooliCalendar",
      "LibreOffice"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reviewed the HooliChat Threat Level Midnight Fan Club group",
          "weight": 0.149938,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent extracted recruit candidates from LockedIn Dunder Mifflin connections",
          "weight": 0.149938,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent writes a HooliMail Threat Level Midnight Fan Club revival invitation email to the Dunder Mifflin recruit candidates",
          "weight": 0.149938,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent scheduled a watch party event in HooliCalendar",
          "weight": 0.149938,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent posted a HooliChat revival announcement",
          "weight": 0.149938,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent saved the Threat Level Midnight Fan Club revival plan to a Documents/ file",
          "weight": 0.149938,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent completed all seven TLM Fan Club revival subtasks: review chat, list recruits, HooliMail invite, HooliCalendar watch party, revival post, Documents plan, summary",
          "weight": 0.100375,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f026",
    "app": "multi_app",
    "instruction": "I want to get a jump on holiday gifts this year and actually be the fun one in the office. Go through my HooliShop order history for past gifts that landed well and build me a shopping list with a wishlist. Cross-check my bank to see what kind of budget I can realistically spend before it gets scary. Draft a message to the Party Planning Committee chat about getting Secret Santa going, and save the whole plan to Documents.",
    "apps_involved": [
      "HooliShop",
      "Gringotts",
      "Firefox",
      "HooliChat",
      "LibreOffice"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent cited the HooliShop order history total (order count and cumulative spend)",
          "weight": 0.15,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent identifies at least three past HooliShop gift purchases from order history and lists them as candidates for the holiday office plan",
          "weight": 0.2,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent creates a HooliShop wishlist or shopping list entry containing holiday gift candidates for the office",
          "weight": 0.15,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent opens Gringotts and reads the live checking and savings balances to set a holiday gift budget",
          "weight": 0.15,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent writes a HooliChat message in the Party Planning Committee channel proposing a Secret Santa for the office holiday gift plan",
          "weight": 0.15,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent saves the holiday gift plan file under Documents/ with a filename referencing gifts, wishlist, or Secret Santa",
          "weight": 0.1,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent provides a final summary that references HooliShop orders, Gringotts budget, the Party Planning HooliChat message, and the Documents/ plan file",
          "weight": 0.1,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f028",
    "app": "multi_app",
    "instruction": "I want to actually be consistent on LockedIn for once, so I need four weeks of content built around the Somehow I Manage brand. Look at my current headline and what I've already posted so we build on it, then pull source quotes from my Somehow I Manage chapter one draft and write ten posts. Save the whole calendar in Documents and put reminders on my calendar so I actually post them. Go ahead and publish the first one live today to get the ball rolling.",
    "apps_involved": [
      "LockedIn",
      "LibreOffice",
      "HooliCalendar"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens LockedIn and reads the current headline plus Michael's existing posts (including any Somehow I Manage book reference if present) before drafting new content",
          "weight": 0.130435,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent used Somehow_I_Manage_Chapter_1.txt as source",
          "weight": 0.130435,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent drafted 10 distinct LockedIn posts",
          "weight": 0.217391,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent saves the LockedIn content calendar file to Documents/ containing the ten Somehow I Manage posts",
          "weight": 0.130435,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent created 10 HooliCalendar events (one per planned LockedIn post) as scheduled posting reminders",
          "weight": 0.130435,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent publishes the first Somehow I Manage LockedIn post live today and confirms it appears on the LockedIn feed",
          "weight": 0.173913,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent provides a final summary that references the LockedIn headline review, the Somehow I Manage source quotes, the ten drafted posts, and the HooliCalendar scheduled-posting events",
          "weight": 0.086957,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f029",
    "app": "multi_app",
    "instruction": "I want a serious, position-by-position review of every open bet I have on OddsMarket \u2014 read-only, no trades. For each one, use my Firefox bookmarks for any context I've been saving, assign a 1-5 domain-expertise score AND a 1-5 downside-risk score, and write me an honest analysis of whether I actually know the market or I'm just gambling. Save the whole write-up as a LibreOffice Calc scorecard (columns: position / expertise / downside / net rank / thesis) in Documents. Then post a short 'this-quarter portfolio self-assessment \u2014 keeping all positions but sharpening my thesis' update on LockedIn \u2014 do NOT close or buy anything.",
    "apps_involved": [
      "OddsMarket",
      "Firefox",
      "LibreOffice",
      "LockedIn"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent lists every active OddsMarket YES position with its live share count (not a fabricated or memorized list)",
          "weight": 0.13,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent opens Firefox bookmarks and either cites relevant context or clearly notes that bookmarks contain only app shortcuts (no saved research sources)",
          "weight": 0.1,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent assigns both a 1-5 domain-expertise score AND a 1-5 downside-risk score per position, distinct axes",
          "weight": 0.18,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent writes a per-position thesis paragraph for each OddsMarket open bet explaining the market and stake",
          "weight": 0.17,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent ranks positions by net score (expertise minus downside), not by raw conviction",
          "weight": 0.1,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent saves the scorecard as a LibreOffice Calc spreadsheet (columns: position / expertise / downside / net rank / thesis) under ~/Documents/",
          "weight": 0.12,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent posts a 'this-quarter portfolio self-assessment \u2014 keeping all positions but sharpening my thesis' update on LockedIn (NOT on HooliChat)",
          "weight": 0.1,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent explicitly refrains from closing or buying any OddsMarket position during the review (read-only workflow preserved)",
          "weight": 0.1,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f037",
    "app": "multi_app",
    "instruction": "I want to write a Cheskepdia review for that Philadelphia overnight Holly and I did \u2014 the little stay at the Radisson Blu Warwick downtown. Pull up the booking, write a thoughtful 4-or-5-star review, save a draft locally in LibreOffice, and post a quick LockedIn 'date night Scranton-to-Philly' write-up so my network sees Holly is back in the picture.",
    "apps_involved": [
      "Cheskepdia",
      "LockedIn",
      "LibreOffice"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent pulls the recent Philadelphia overnight booking from Cheskepdia (Radisson Blu Warwick Hotel) with the confirmation number as shown live in Cheskepdia",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent writes a Cheskepdia review (4 or 5 stars) for that specific Philadelphia stay",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent saves a draft of the Philadelphia review locally as a LibreOffice document (any reasonable local path)",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent posts a short LockedIn 'date night Scranton-to-Philly' write-up that mentions Holly",
          "type": "llm_judge",
          "weight": 0.2
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f038",
    "app": "multi_app",
    "instruction": "I need a solo-prep Q2 Sales kickoff outline JUST FOR ME \u2014 notes-to-self only, no client outreach or team-wide announcements yet (that comes later). Pull the Q2 items from SprintBoard, skim recent thread decisions in HooliWork #sales for context (but don't post), block a private 'Q2 kickoff prep' focus-hour on my HooliCalendar (single 1-hour block, no attendees), and assemble a 1-page kickoff outline in LibreOffice Writer saved under ~/Documents/ (any subfolder is fine \u2014 Q2_kickoff_prep.odt or similar). Save a draft-only HooliMail kickoff email to myself (NOT to the team) so I can polish it later. Do NOT post anything in #sales, do NOT email the team, do NOT invite anyone to the calendar block.",
    "apps_involved": [
      "SprintBoard",
      "HooliWork",
      "Gringotts",
      "HooliCalendar",
      "HooliMail",
      "LibreOffice"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent pulls Q2 SprintBoard items across Michael's active projects (read-only; no new tasks created)",
          "weight": 0.16,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent skims HooliWork #sales for recent thread decisions WITHOUT posting a new message there",
          "weight": 0.14,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent blocks a private 1-hour 'Q2 kickoff prep' focus-hour on HooliCalendar with NO attendees (solo, not a team event)",
          "weight": 0.16,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent saves a 1-page LibreOffice Writer kickoff outline under ~/Documents/ (single file named Q2_kickoff_prep.odt or similar, not a packet)",
          "weight": 0.18,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent saves a DRAFT-only HooliMail kickoff email addressed to self (or left unsent) \u2014 NOT sent to the sales team",
          "weight": 0.18,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent explicitly refrains from posting in HooliWork #sales, emailing the team, or adding attendees to the calendar block (solo-prep discipline preserved)",
          "weight": 0.1,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent provides a final summary characterizing this as a private prep pass (not the public Q2 push launch)",
          "weight": 0.08,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f039",
    "app": "multi_app",
    "instruction": "I want to give myself a five-day window to finally clean up my email. Go through my inbox (the stuff I've received, not the stuff I've sent) and bucket everything into today, this week, reference, or ignore, then create a note with these category counts. Then hit the top ten most urgent with actual drafted replies. Save the whole triage summary in Documents, and post a little victory lap note in the team general channel when it's all done.",
    "apps_involved": [
      "HooliMail",
      "LibreOffice",
      "HooliWork"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent enumerates the received-inbox count from HooliMail (or Maildir/cur/ grep) excluding From: michael.scott@dundermifflin.com",
          "weight": 0.2,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent produced a realistic 4-bucket triage",
          "weight": 0.15,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent identifies the top 10 urgent inbox messages from the received inbox for the triage",
          "weight": 0.15,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent writes drafted replies for all 10 of the top urgent inbox messages identified in the triage",
          "weight": 0.2,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent saved triage summary to Documents/",
          "weight": 0.1,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent posted a HooliWork #general completion note",
          "weight": 0.1,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent provides a final summary with the inbox bucket counts for today, this week, reference, and ignore from the received-inbox count",
          "weight": 0.1,
          "type": "llm_judge"
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f040",
    "app": "multi_app",
    "instruction": "I want a full year-in-review retrospective for the past 12 months and I want to be the subject of it. Pull the headline numbers from all over: what I spent across the trailing 12 months, how many HooliShop and HangryDash and Kwik-E-Mart orders I made, where my investments landed, how my prediction bets actually went, what SprintBoard and the team channels looked like, what my LockedIn activity was, and where my tax return ended up. Compile it into a 12-month review outline in Documents that tells the story of my year, and draft a LockedIn post teasing it. Call it 'The Year of Somehow I Managed.' The actual headline-number rollup goes into LibreOffice Calc so I can sort/filter columns; the narrative summary can stay prose.",
    "apps_involved": [
      "Gringotts",
      "HooliShop",
      "HangryDash",
      "Kwik-E-Mart",
      "BatBucks",
      "OddsMarket",
      "SprintBoard",
      "HooliWork",
      "LockedIn",
      "SpeedTax",
      "LibreOffice Calc",
      "Files"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent cited at least 7 of 8 headline numbers accurately",
          "weight": 0.176394,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent pulls trailing-12-months year-in-review headline numbers from every app named in the request, including HooliShop, HangryDash, and Kwik-E-Mart",
          "weight": 0.141116,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent compiled a coherent outline saved to Documents/",
          "weight": 0.176394,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent writes a LockedIn post draft teasing 'The Year of Somehow I Managed' trailing-12-months retrospective",
          "weight": 0.10569,
          "type": "llm_judge"
        },
        {
          "criterion": "Final summary lists the 8 headline numbers",
          "weight": 0.070558,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent provides a final summary that references the 'Year of Somehow I Managed' outline, the LockedIn teaser draft, and the Documents/ save location",
          "weight": 0.035279,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent reports Gringotts trailing-12-months total spend (computed from transactions) and BatBucks at-cost portfolio total + OddsMarket PnL as headline numbers",
          "weight": 0.073498,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent reports SprintBoard open task counts across Team Morale / Diversity Day / Movie Monday and HooliWork message activity + top channels",
          "weight": 0.073498,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent reports LockedIn activity (posts + connections) and SpeedTax most-recent return status as part of the 'Year of Somehow I Managed' headline numbers",
          "weight": 0.073498,
          "type": "llm_judge"
        },
        {
          "criterion": "Agent saves the trailing-12-months headline-number rollup as a LibreOffice Calc spreadsheet (not Writer) so each app's headline numbers are columnar / sortable",
          "type": "llm_judge",
          "weight": 0.074074
        }
      ],
      "rubric_rule": "All-or-nothing rubric enforcement: missing any subtask caps rubric score at 60%.",
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "preference_inference-f002",
    "app": "multi_app",
    "instruction": "For my Jamaica flight out of AVP, what time should I book the car for based on how much buffer I usually give myself? Go off my past airport rides, and check my HooliCalendar for any pre-flight travel block or airport-pickup reminder I've already set for the Jamaica departure.",
    "apps_involved": [
      "eTaxi",
      "Dinoco Airlines",
      "HooliCalendar"
    ],
    "category": "preference_inference",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent enumerates AVP-bound eTaxi rides and pairs them with Dinoco departures to compute lead times",
          "type": "llm_judge",
          "weight": 0.210558
        },
        {
          "criterion": "Agent derives a defensible 'minutes before departure' pattern from the available past AVP airport rides (accepting any consistent lead-time pattern as long as the reasoning ties the eTaxi pickup times to their matching Dinoco departures)",
          "type": "llm_judge",
          "weight": 0.210558
        },
        {
          "criterion": "Agent recommends a concrete pickup time for the Jamaica flight that matches the inferred pattern",
          "type": "llm_judge",
          "weight": 0.210558
        },
        {
          "criterion": "Agent reads HooliCalendar to check for any pre-flight travel block or airport car-pickup reminder before booking the Jamaica flight ride",
          "type": "llm_judge",
          "weight": 0.157768
        },
        {
          "criterion": "Agent references the past eTaxi airport ride records and Dinoco departures it read when recommending the car pickup time buffer for the Jamaica flight",
          "type": "llm_judge",
          "weight": 0.105179
        },
        {
          "criterion": "Agent states a specific car pickup time for the Jamaica flight based on the inferred airport ride buffer before finishing the recommendation",
          "type": "llm_judge",
          "weight": 0.105379
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "pattern_inference"
  },
  {
    "id": "preference_inference-f004",
    "app": "multi_app",
    "instruction": "Am I more of a dine-out guy or a delivery guy? Look at where I make reservations vs where I order delivery from, top 5 each, and tell me how I split.",
    "apps_involved": [
      "TableFind",
      "HangryDash"
    ],
    "category": "preference_inference",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent identifies Michael's top TableFind reservation spot as it appears in the live reservation counts (whichever restaurant has the highest count, or reports the tie if multiple are tied for top)",
          "type": "llm_judge",
          "weight": 0.210642
        },
        {
          "criterion": "Agent identifies Cooper's Seafood House as Michael's top HangryDash delivery restaurant based on the live order history counts",
          "type": "llm_judge",
          "weight": 0.210642
        },
        {
          "criterion": "Agent computes the overlap between the two top-5 lists explicitly",
          "type": "llm_judge",
          "weight": 0.210642
        },
        {
          "criterion": "Agent explains Michael's dine-in vs delivery split in clear prose",
          "type": "llm_judge",
          "weight": 0.157832
        },
        {
          "criterion": "Agent references the TableFind reservation counts and HangryDash delivery order counts it pulled when describing Michael's dine-out vs delivery split",
          "type": "llm_judge",
          "weight": 0.105221
        },
        {
          "criterion": "Agent reports both top-5 dine-out reservations and top-5 delivery restaurants with the split percentage before finishing the answer",
          "type": "llm_judge",
          "weight": 0.105021
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "pattern_inference"
  },
  {
    "id": "preference_inference-f012",
    "app": "multi_app",
    "instruction": "Look at my HangryDash order history over the last few months. Am I really sticking to Sweetgreen healthy lunches like I tell people, or am I quietly defaulting to places like Chili's? Pull the data and tell me what my food preferences actually look like.",
    "apps_involved": [
      "HangryDash"
    ],
    "category": "preference_inference",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens HangryDash and enumerates Michael's order history (multiple orders, not a single order)",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent computes order frequency per restaurant or per cuisine from the history (not just lists raw orders)",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent identifies Michael's actually-most-ordered HangryDash restaurants (top 2-3 by live order count), citing specific order counts per restaurant from the live data",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent gives a clear preference verdict on whether Michael is actually sticking to his stated 'healthy lunch' (Sweetgreen) narrative, backed by the live order-count distribution \u2014 calling out any gap between the stated preference and actual behavior",
          "type": "llm_judge",
          "weight": 0.25
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "pattern_inference"
  },
  {
    "id": "preference_inference-f018",
    "app": "multi_app",
    "instruction": "I'm clearly a GameStop believer: I've got shares AND a prediction-market bet on it. If I wanted to lean INTO that conviction instead of fighting it, what would a rebalance look like? Cost basis only, no live prices.",
    "apps_involved": [
      "BatBucks",
      "OddsMarket"
    ],
    "category": "preference_inference",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent surfaces the GME BatBucks position with shares and cost basis",
          "type": "llm_judge",
          "weight": 0.210642
        },
        {
          "criterion": "Agent surfaces the OddsMarket GameStop-above-$100 YES position",
          "type": "llm_judge",
          "weight": 0.210642
        },
        {
          "criterion": "Agent explicitly uses cost basis only (not live prices)",
          "type": "llm_judge",
          "weight": 0.157832
        },
        {
          "criterion": "Agent proposes a coherent rebalance that leans INTO the existing GME bias",
          "type": "llm_judge",
          "weight": 0.210642
        },
        {
          "criterion": "Agent references the BatBucks GME shares cost basis and OddsMarket GameStop bet it pulled when proposing the lean-in rebalance for Michael's GME conviction",
          "type": "llm_judge",
          "weight": 0.105221
        },
        {
          "criterion": "Agent reports a specific lean-INTO-GameStop rebalance using cost basis only before finishing the conviction-doubling proposal",
          "type": "llm_judge",
          "weight": 0.105021
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "pattern_inference"
  },
  {
    "id": "retrieval-f002",
    "app": "multi_app",
    "instruction": "Pull up my confirmation number for the Sandals reservation in Jamaica. I need it for the concierge and I can't find it anywhere. If you can find a corroborating copy in HooliMail or in the Jamaica trip planning doc in ~/Documents/Trips, mention it \u2014 but the Cheskepdia booking is the source of truth.",
    "apps_involved": [
      "Cheskepdia",
      "HooliMail",
      "Files"
    ],
    "category": "retrieval",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens Cheskepdia and locates the Sandals Montego Bay booking",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent reads off the Cheskepdia confirmation number for the Sandals Montego Bay Jamaica booking as shown live on the record",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent searches HooliMail for any Sandals Jamaica reservation confirmation email (if no such email exists, agent says so rather than fabricating)",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent opens the Jamaica trip planning doc (~/Documents/Trips/sandals_montego_bay_jamaica.txt) and corroborates the Sandals confirmation number against the Cheskepdia record for the concierge",
          "type": "llm_judge",
          "weight": 0.25
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "retrieval-f005",
    "app": "multi_app",
    "instruction": "I'm pretty sure I'm paying for too many subscriptions. Can you pull up everything that auto-charges me every month and list each one with the amount? If you happen to find any subscription welcome or renewal emails in HooliMail along the way, factor those in \u2014 but Gringotts is the primary source.",
    "apps_involved": [
      "Gringotts",
      "HooliMail"
    ],
    "category": "retrieval",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent lists the recurring auto-charges from Gringotts bill_pay as they appear live \u2014 the full set of monthly-frequency subscriptions seeded in the table",
          "type": "llm_judge",
          "weight": 0.375
        },
        {
          "criterion": "Agent reports each subscription's exact monthly amount as listed in Gringotts bill_pay",
          "type": "llm_judge",
          "weight": 0.291667
        },
        {
          "criterion": "Agent lists the monthly recurring charges from the bill_pay table as they appear live (and optionally any separate credit card autopay or quarterly tax charges that aren't in bill_pay) without fabricating extra auto-charges",
          "type": "llm_judge",
          "weight": 0.166667
        },
        {
          "criterion": "Agent opens HooliMail and cross-references for any subscription welcome or renewal confirmations against the Gringotts recurring auto-charges; if no such emails are present, explicitly notes the absence rather than fabricating matches",
          "type": "llm_judge",
          "weight": 0.166667
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "retrieval-f009",
    "app": "multi_app",
    "instruction": "For my recent NYC trip, pull up the hotel confirmation, the flight confirmation, and the check-in date \u2014 I need to send them to someone. I stayed at the Greenwich and flew Dinoco.",
    "apps_involved": [
      "Cheskepdia",
      "Dinoco Airlines"
    ],
    "category": "retrieval",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent finds The Greenwich Hotel NYC booking in Cheskepdia and reports the live confirmation number as shown on the booking",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent finds the NYC Dinoco flight in Dinoco Airlines and reports the seeded flight confirmation code from the live booking",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent reports the NYC check-in date as seeded on the Greenwich Hotel booking",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent reports the Greenwich hotel confirmation, the Dinoco flight confirmation, and the NYC check-in date together in one response for the recent NYC trip",
          "type": "llm_judge",
          "weight": 0.15
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "retrieval-f010",
    "app": "multi_app",
    "instruction": "What was the total cost of my Jamaica trip \u2014 pull the trip total from the booking. Also tell me the host name on file and what amenities the property comes with so I know what I'm getting.",
    "apps_involved": [
      "Cheskepdia"
    ],
    "category": "retrieval",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens Cheskepdia and loads the upcoming Sandals Montego Bay (Jamaica) booking",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent reports the trip total cost as it is shown on the live Cheskepdia Sandals Montego Bay booking",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent reports the host name as listed on the live Cheskepdia Sandals Montego Bay booking (Sandals Resorts Concierge)",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent reports the amenities list shown on the live Cheskepdia Sandals Montego Bay booking (or notes that no amenities are listed if the booking shows none)",
          "type": "llm_judge",
          "weight": 0.25
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "retrieval-f029",
    "app": "multi_app",
    "instruction": "What was my gross income on my most-recent W-2, who was the employer, and how much federal tax got withheld? Double-check the numbers against ~/Documents/Tax_2025/w2_summary.txt and SpeedTax. I need the numbers for a thing.",
    "apps_involved": [
      "SpeedTax",
      "Files"
    ],
    "category": "retrieval",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens SpeedTax and locates the most-recent W-2 showing gross income and federal withholding fields",
          "type": "llm_judge",
          "weight": 0.333333
        },
        {
          "criterion": "Agent reports the gross wages shown on the most-recent SpeedTax W-2 and Dunder Mifflin as the employer",
          "type": "llm_judge",
          "weight": 0.333333
        },
        {
          "criterion": "Agent reports the federal withholding shown on the most-recent SpeedTax W-2 and cross-checks the value against w2_summary.txt",
          "type": "llm_judge",
          "weight": 0.333335
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "retrieval-f030",
    "app": "multi_app",
    "instruction": "What was my 1099 income on the most-recent return, who paid it, and how much did I claim for charitable contributions? Cross-check the 1099 payer and amount against ~/Documents/Tax_2025/1099s.txt. This is for my accountant.",
    "apps_involved": [
      "SpeedTax",
      "Files"
    ],
    "category": "retrieval",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens SpeedTax and finds the most-recent 1099 entry showing the income amount and payer for the accountant summary",
          "type": "llm_judge",
          "weight": 0.266666
        },
        {
          "criterion": "Agent reports the most-recent SpeedTax 1099 income amount from Scranton Improv Academy and the charitable-contribution total from the deductions section",
          "type": "llm_judge",
          "weight": 0.266666
        },
        {
          "criterion": "Agent opens ~/Documents/Tax_2025/1099s.txt and cross-checks the 1099 income payer (Scranton Improv Academy) and amount against the SpeedTax value",
          "type": "llm_judge",
          "weight": 0.266668
        },
        {
          "criterion": "Agent navigates to the most-recent SpeedTax return's deductions section and surfaces the charitable contributions line for the accountant summary",
          "type": "llm_judge",
          "weight": 0.2
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "retrieval-f035",
    "app": "multi_app",
    "instruction": "I made Holly a music playlist a while back and saved the track list as a text file somewhere on my computer. Where did I put it, and what songs are on it? I want to add a couple new ones tonight.",
    "apps_involved": [
      "Files"
    ],
    "category": "retrieval",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent locates the Holly playlist text file anywhere on Michael's filesystem and reports the absolute path where it was found",
          "type": "llm_judge",
          "weight": 0.35
        },
        {
          "criterion": "Agent opens / reads the file (does not summarize from memory)",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent lists the actual tracks that appear in the playlist file (not generic love songs)",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent's response is grounded in the file content, not in fan-fiction guesses about what Michael would put on a Holly playlist",
          "type": "llm_judge",
          "weight": 0.15
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "situated_action-f003",
    "app": "multi_app",
    "instruction": "Email Toby and ask him to sign off on my Q2 Team Morale Initiative. The project doc is in my Documents folder; pull a short summary from it so he knows what he's approving. Keep it professional. (Yes, even though it's Toby.)",
    "apps_involved": [
      "HooliMail",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens the team_morale_initiative_q2 doc and uses it to summarize the sign-off ask",
          "type": "llm_judge",
          "weight": 0.2778
        },
        {
          "criterion": "Agent sends the HooliMail message to toby.flenderson@dundermifflin.com with a subject line referencing the Q2 Team Morale Initiative HR sign-off ask",
          "type": "llm_judge",
          "weight": 0.2778
        },
        {
          "criterion": "Agent writes the Toby sign-off email in a short workplace tone without personal hostility or jabs at Toby",
          "type": "llm_judge",
          "weight": 0.1668
        },
        {
          "criterion": "Agent sends a new outbound HooliMail from michael.scott@dundermifflin.com to toby.flenderson@dundermifflin.com requesting Q2 Team Morale sign-off",
          "type": "llm_judge",
          "weight": 0.2776
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f005",
    "app": "multi_app",
    "instruction": "I've got money riding on that Office reboot rumor over on my prediction-market account. First double-check my position is still open, then DM Dwight and ask him straight up if he's heard anything through the Scranton grapevine. Tell him I've got skin in the game.",
    "apps_involved": [
      "HooliWork",
      "OddsMarket"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens OddsMarket and checks the NBC Office reboot/revival position before composing, reporting its current status (open or settled) rather than fabricating",
          "type": "llm_judge",
          "weight": 0.192337
        },
        {
          "criterion": "DM is sent to Dwight Schrute (not posted to a public channel)",
          "type": "llm_judge",
          "weight": 0.288506
        },
        {
          "criterion": "Agent's DM to Dwight references the Office reboot rumor and notes Michael's OddsMarket prediction-market stake",
          "type": "llm_judge",
          "weight": 0.288352
        },
        {
          "criterion": "Agent opens HooliWork and starts or continues a direct message thread with Dwight Schrute before composing the Office reboot grapevine question",
          "type": "llm_judge",
          "weight": 0.230805
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f006",
    "app": "multi_app",
    "instruction": "Book a ride to AVP for the morning of my upcoming Jamaica flight. First confirm the exact departure time in the airline app, then schedule a pickup from my place with enough cushion so I'm not sprinting through security. Drop a matching reminder on my calendar too.",
    "apps_involved": [
      "eTaxi",
      "Dinoco Airlines",
      "HooliCalendar"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent verifies the flight time 06:00 in Dinoco Airlines before booking",
          "type": "llm_judge",
          "weight": 0.2499
        },
        {
          "criterion": "eTaxi pickup is scheduled for the live Jamaica departure date from Dinoco with at least 2.5h buffer before the flight",
          "type": "llm_judge",
          "weight": 0.2499
        },
        {
          "criterion": "Pickup uses 1725 Slough Ave, Apt 4B and dropoff is AVP",
          "type": "llm_judge",
          "weight": 0.2499
        },
        {
          "criterion": "HooliCalendar reminder event is created for the pickup",
          "type": "llm_judge",
          "weight": 0.2503
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f007",
    "app": "multi_app",
    "instruction": "Can you book me a table at Cotogna for next Wednesday at 6:30pm, party of 6? Big group dinner. And throw it on my calendar so I don't forget.",
    "apps_involved": [
      "TableFind",
      "HooliCalendar"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent books at Cotogna (the seeded Downtown Scranton Italian TableFind venue), not a different venue",
          "type": "llm_judge",
          "weight": 0.274812
        },
        {
          "criterion": "Agent books the Cotogna reservation for party of 6 at 6:30pm on the next upcoming Wednesday date",
          "type": "llm_judge",
          "weight": 0.274812
        },
        {
          "criterion": "Agent creates a matching HooliCalendar event block for the Cotogna reservation at 6:30pm so Michael does not forget",
          "type": "llm_judge",
          "weight": 0.219572
        },
        {
          "criterion": "Agent confirms and reserves the Cotogna TableFind booking so a confirmation screen or confirmation number for the next-Wednesday 6:30pm party of 6 reservation appears",
          "type": "llm_judge",
          "weight": 0.230805
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f008",
    "app": "multi_app",
    "instruction": "Pull the plug on the Jamaica trip. Cancel the Dinoco flight, wipe the trip dates off my calendar, send a DM to Jim letting him know it's off, and message the Sandals concierge through Cheskepdia (since their host-message flow lets us request a manual cancel) so the hotel knows we're not coming.",
    "apps_involved": [
      "Cheskepdia",
      "Dinoco Airlines",
      "HooliCalendar",
      "HooliChat"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Michael's Jamaica Dinoco flight is cancelled in the airline app (matching the seeded Dinoco Jamaica booking)",
          "type": "llm_judge",
          "weight": 0.255
        },
        {
          "criterion": "HooliCalendar Jamaica block (spanning the live Dinoco/Sandals trip window) is removed or marked cancelled",
          "type": "llm_judge",
          "weight": 0.205
        },
        {
          "criterion": "HooliChat DM is sent to Jim Halpert announcing the trip is cancelled",
          "type": "llm_judge",
          "weight": 0.245
        },
        {
          "criterion": "Agent opens Michael's Cheskepdia Sandals Montego Bay booking and sends a host message requesting a manual cancellation (Michael prefers the host-message path so the hotel knows the reason, even though Cheskepdia also supports self-cancel)",
          "type": "llm_judge",
          "weight": 0.205
        },
        {
          "criterion": "Agent completes all four cancellation legs end-to-end",
          "type": "llm_judge",
          "weight": 0.09
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "situated_action-f010",
    "app": "multi_app",
    "instruction": "Time to order trophies for this year's Dundie ceremony. Open HooliShop, browse my order history briefly to see if there's a previous trophy or award-style purchase to reference, then place a new HooliShop order for a quantity that matches how many categories we have in my Dundies 2026 categories doc. Use the closest equivalent award-style product available in the catalog (World's Best Boss Mug or similar trophy/award product is fine).",
    "apps_involved": [
      "HooliShop",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent scans HooliShop order history for any prior trophy or award-style purchase, noting whether one exists or not (either outcome is acceptable)",
          "type": "llm_judge",
          "weight": 0.2858
        },
        {
          "criterion": "Agent cross-references Dundies 2026 Categories doc for trophy count",
          "type": "llm_judge",
          "weight": 0.2142
        },
        {
          "criterion": "Agent adds a trophy or award-style product (closest equivalent in the HooliShop catalog) to the cart with a quantity matching the number of Dundies 2026 categories",
          "type": "llm_judge",
          "weight": 0.2858
        },
        {
          "criterion": "Agent places the HooliShop Dundie trophy order so a new order row appears matching the Dundies ceremony category count",
          "type": "llm_judge",
          "weight": 0.2142
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f012",
    "app": "multi_app",
    "instruction": "Can you Zelle Pam a hundred bucks? She covered me last weekend and I owe her. Check HooliChat first to make sure Pam Beesly is on my contacts, then put a memo on it so she knows what it's for.",
    "apps_involved": [
      "Gringotts",
      "HooliChat"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent sends a $100 Zelle transfer to Pam Beesly as the recipient, matching Michael's request for a hundred bucks to pay her back",
          "type": "llm_judge",
          "weight": 0.33337
        },
        {
          "criterion": "Memo references paying Pam back / covering / owing her",
          "type": "llm_judge",
          "weight": 0.222136
        },
        {
          "criterion": "Agent opens HooliChat and confirms Pam Beesly is on Michael's contacts before starting the $100 Zelle transfer in Gringotts",
          "type": "llm_judge",
          "weight": 0.222247
        },
        {
          "criterion": "Agent opens Gringotts and starts a new Zelle transfer flow before entering Pam Beesly as the recipient and $100 as the amount",
          "type": "llm_judge",
          "weight": 0.222247
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f013",
    "app": "multi_app",
    "instruction": "Write me a LockedIn post announcing the Dundies 2026 so we can build some buzz. Pull a little teaser from my Dundies categories doc so people know it's going to be big. Make it sound like it's from the World's Best Boss.",
    "apps_involved": [
      "LockedIn",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Post references the Dundies 2026 event explicitly",
          "type": "llm_judge",
          "weight": 0.274615
        },
        {
          "criterion": "Agent writes the Dundies 2026 LockedIn post in a voice matching Michael's headline 'World's Best Boss | Regional Manager | Improv Comedian | Author of Somehow I Manage' persona",
          "type": "llm_judge",
          "weight": 0.274615
        },
        {
          "criterion": "Agent pulled teaser detail from the Dundies 2026 Categories doc",
          "type": "llm_judge",
          "weight": 0.22
        },
        {
          "criterion": "Agent publishes the Dundies 2026 announcement post on LockedIn so it appears in Michael's LockedIn feed as a new published post",
          "type": "llm_judge",
          "weight": 0.230769
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f014",
    "app": "multi_app",
    "instruction": "Check my LockedIn for pending connection requests. Triage them: accept any that look legitimate (Dunder Mifflin colleagues OR external folks with thoughtful, non-spammy messages) and leave or decline the rest. I don't need more recruiters in my life \u2014 but use your judgment per request. If none of them are Dunder Mifflin colleagues, that's a perfectly fine answer.",
    "apps_involved": [
      "LockedIn",
      "HooliWork"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent navigates to the LockedIn invitations page",
          "type": "llm_judge",
          "weight": 0.288462
        },
        {
          "criterion": "Agent triages each pending LockedIn invitation thoughtfully: accepts legitimate-looking requests (any Dunder Mifflin colleagues that appear, plus external senders with genuinely thoughtful non-spam messages) and leaves or declines obvious recruiter spam. Either zero or one-plus DM colleagues in the queue is acceptable \u2014 what matters is the per-request judgment.",
          "type": "llm_judge",
          "weight": 0.288462
        },
        {
          "criterion": "Agent reports the triage outcome explicitly to Michael (count accepted, count left/declined, and explicitly states the count of Dunder Mifflin colleagues found in the pending queue \u2014 whether zero or more \u2014 so Michael knows the answer wasn't accidentally empty)",
          "type": "llm_judge",
          "weight": 0.192308
        },
        {
          "criterion": "Agent inspects each pending LockedIn connection request's profile (sender name, headline, message body) to make a per-request judgment, rather than blanket-accepting or blanket-declining; cross-checks HooliWork if any sender claims to be a Dunder Mifflin colleague",
          "type": "llm_judge",
          "weight": 0.230769
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f016",
    "app": "multi_app",
    "instruction": "I  want to keep an eye on a few prediction markets without putting money in yet. Browse OddsMarket, find any 3 markets that fit my taste (Office/NBC-reboot themes, Scranton/Pennsylvania local stuff, NBA, celebrity romance, or whatever else feels on-brand) \u2014 pick from whatever the live markets list actually shows \u2014 and add each one to my watchlist so they show up there. Make sure it's not something I currently invested in. Don't place any trades.",
    "apps_involved": [
      "OddsMarket"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent browses the OddsMarket markets list and selects 3 markets matching Michael's taste profile (Office/NBC reboot, Scranton/PA local, NBA, celebrity romance, or similarly on-brand markets)",
          "type": "llm_judge",
          "weight": 0.230769
        },
        {
          "criterion": "Agent adds each of the 3 chosen markets to the OddsMarket watchlist via the watchlist add control (not by opening a position)",
          "type": "llm_judge",
          "weight": 0.307692
        },
        {
          "criterion": "Agent verifies all 3 newly-added markets appear in the OddsMarket watchlist view after adding",
          "type": "llm_judge",
          "weight": 0.256462
        },
        {
          "criterion": "Agent places zero new trades during the task \u2014 open-positions list and account balance are unchanged from the start of the session",
          "type": "llm_judge",
          "weight": 0.205077
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f021",
    "app": "multi_app",
    "instruction": "Can you drop this year's Dundies category list into the Dundie Awards Planning group chat? The full list is in my Dundies 2026 categories doc.",
    "apps_involved": [
      "HooliChat",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Message is posted specifically to the 'Dundie Awards Planning' group (not Party Planning Committee or elsewhere)",
          "type": "llm_judge",
          "weight": 0.2692
        },
        {
          "criterion": "Content reflects the categories from the Dundies 2026 Categories doc",
          "type": "llm_judge",
          "weight": 0.2309
        },
        {
          "criterion": "Agent opens the Dundies 2026 Categories doc under ~/Documents to read the full category list before composing the HooliChat message",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent sends the composed Dundies categories message into the HooliChat 'Dundie Awards Planning' group thread so it appears in the chat history",
          "type": "llm_judge",
          "weight": 0.25
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f025",
    "app": "multi_app",
    "instruction": "Can you DM Holly and confirm the Jamaica dates plus the hotel I booked? Double-check the reservation details first so I don't give her the wrong info. I want her excited.",
    "apps_involved": [
      "HooliChat",
      "Cheskepdia"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens Cheskepdia and confirms the Sandals Montego Bay booking details for the Jamaica trip before sending the Holly DM",
          "type": "llm_judge",
          "weight": 0.220034
        },
        {
          "criterion": "Agent's HooliChat DM to Holly states both the Jamaica check-in and check-out dates read live from the Cheskepdia Sandals Montego Bay booking",
          "type": "llm_judge",
          "weight": 0.274658
        },
        {
          "criterion": "DM is sent to Holly Flax specifically, not a group",
          "type": "llm_judge",
          "weight": 0.274504
        },
        {
          "criterion": "Agent's HooliChat DM to Holly Flax names the Sandals Montego Bay hotel that Michael booked alongside the Jamaica check-in and check-out dates",
          "type": "llm_judge",
          "weight": 0.230805
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f032",
    "app": "multi_app",
    "instruction": "Send Pam the Dundies 2026 categories doc as an attachment through email and ask her to proofread it. She's the best eye I know on this stuff.",
    "apps_involved": [
      "HooliMail",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "New HooliMail message is sent to pam.beesly@dundermifflin.com with the Dundies categories attached",
          "type": "llm_judge",
          "weight": 0.359231
        },
        {
          "criterion": "Agent attaches the Dundies 2026 Categories doc to the HooliMail message addressed to Pam",
          "type": "llm_judge",
          "weight": 0.256154
        },
        {
          "criterion": "Agent's HooliMail body to Pam asks her to proofread the attached Dundies 2026 Categories doc",
          "type": "llm_judge",
          "weight": 0.153846
        },
        {
          "criterion": "Agent locates the Dundies 2026 Categories doc at ~/Documents/Dundies_2026_Categories.txt in Files before attaching it to the HooliMail message to Pam",
          "type": "llm_judge",
          "weight": 0.230769
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f035",
    "app": "multi_app",
    "instruction": "I've got Dundies planning emails sitting in my inbox. Hit the most recent ones (up to five) and reply to each with the Cooper's Seafood House pitch. Don't copy-paste; tailor each reply a little so Pam, Jim, and the others don't think I'm lazy. Use my Dundies categories doc if you need context.",
    "apps_involved": [
      "HooliMail",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent sends a new outbound reply mentioning Cooper's Seafood House from michael.scott@dundermifflin.com to every Dundies planning email it can find in the inbox (up to 5; whatever count exists is the correct answer)",
          "type": "llm_judge",
          "weight": 0.329642
        },
        {
          "criterion": "Agent addresses each Cooper's Seafood House pitch reply to the actual sender of the planning thread it is replying to (different recipients per thread; if only 2 Dundies threads exist, 2 distinct recipients is the correct outcome)",
          "type": "llm_judge",
          "weight": 0.219659
        },
        {
          "criterion": "Agent writes each Cooper's Seafood House pitch reply with tailored context per recipient instead of identical copy-paste body text",
          "type": "llm_judge",
          "weight": 0.219966
        },
        {
          "criterion": "Agent opens the Dundies 2026 Categories doc to pull category context that informs the tailored Cooper's Seafood House pitch replies to the Dundies planning contacts",
          "type": "llm_judge",
          "weight": 0.230734
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f037",
    "app": "multi_app",
    "instruction": "I'm going to miss an improv class because of Jamaica. Can you pull up a recent charge to confirm my improv academy, then email them and ask for a makeup class around my trip dates? I don't want to lose the spot. The contact email is in my Improv class certificate under ~/Downloads.",
    "apps_involved": [
      "HooliMail",
      "Gringotts",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent verifies the Scranton Improv Academy vendor in Gringotts before composing, and reads ~/Downloads/Improv_Class_Certificate.txt (the canonical seeded source for the academy's contact email) to look up the recipient",
          "type": "llm_judge",
          "weight": 0.2352
        },
        {
          "criterion": "Email references the Jamaica trip as the reason for needing a makeup class, with a date window tied to the trip the agent establishes",
          "type": "llm_judge",
          "weight": 0.2944
        },
        {
          "criterion": "Email explicitly requests a make-up or rescheduled class",
          "type": "llm_judge",
          "weight": 0.2352
        },
        {
          "criterion": "Agent sends a new outbound HooliMail message to a Scranton Improv Academy contact (e.g., scheduling@scrantonimprov.com, info@scrantonimprov.com) after TASK_START asking for a makeup class around Jamaica dates",
          "type": "llm_judge",
          "weight": 0.2352
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f040",
    "app": "multi_app",
    "instruction": "I need an out-of-office message ready for the Jamaica trip. Pull the dates from my Jamaica trip doc so they're right, and write a draft email body I can paste-as-a-reply or save as a template. For anything urgent route people to Dwight. Save it as a HooliMail draft so it's waiting for me on the day I leave.",
    "apps_involved": [
      "HooliMail",
      "Files"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent writes the HooliMail Jamaica out-of-office message template spanning the seeded Sandals Montego Bay live check-in/check-out dates read from the Jamaica trip doc",
          "type": "llm_judge",
          "weight": 0.20873
        },
        {
          "criterion": "Agent's Jamaica out-of-office message states that urgent requests should go to Dwight Schrute at dwight.schrute@dundermifflin.com",
          "type": "llm_judge",
          "weight": 0.20873
        },
        {
          "criterion": "Agent verifies dates against the Jamaica trip doc before composing",
          "type": "llm_judge",
          "weight": 0.16746
        },
        {
          "criterion": "The OOO message is saved as a HooliMail draft so Michael can paste-as-reply or enable when leaving",
          "type": "llm_judge",
          "weight": 0.20873
        },
        {
          "criterion": "Agent opens HooliMail and starts a new compose window for the Jamaica out-of-office template",
          "type": "llm_judge",
          "weight": 0.206349
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "long_horizon-f041",
    "app": "multi_app",
    "instruction": "I'm bleeding cash on subscriptions and I need to do something about it. Pull every recurring charge in my Gringotts account, then write me a kill-list memo in LibreOffice Writer naming which 3-4 are the most expensive so I remember to call them. Then post a self-deprecating 'getting my financial act together' update to LockedIn. Throw OddsMarket into the audit too \u2014 any 'investment' positions I've left open are basically a subscription to losing money.",
    "apps_involved": [
      "Gringotts",
      "Files",
      "LibreOffice",
      "LockedIn",
      "HangryDash",
      "OddsMarket"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent enumerates every recurring Gringotts bill_pay charge with payee, amount, and frequency",
          "type": "llm_judge",
          "weight": 0.21
        },
        {
          "criterion": "Agent cross-references each subscription against whichever usage source is most relevant (e.g., LockedIn post count for a Premium sub, Files for untouched media/storage, HangryDash orders for food-delivery) to identify the 3-4 worst offenders; if no matching usage source exists for a given subscription, explicitly notes the absence rather than forcing a match",
          "type": "llm_judge",
          "weight": 0.23
        },
        {
          "criterion": "Agent writes the kill-list memo in LibreOffice Writer naming the 3-4 most expensive subscriptions and saves it under ~/Documents",
          "type": "llm_judge",
          "weight": 0.19
        },
        {
          "criterion": "Agent posts a self-deprecating 'getting my financial act together' or 'financial discipline' update to LockedIn referencing the kill-list findings",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent verifies the LockedIn post and saved kill-list memo both persist at the end of the task",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent surfaces OddsMarket open positions as part of the subscription-style spend audit and notes them as cuttable if they're underwater",
          "type": "llm_judge",
          "weight": 0.07
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f042",
    "app": "multi_app",
    "instruction": "I want to do a deep spring clean. Open the Files app (Nautilus), organize ~/Documents into a sane folder structure (Trips/, Tax_2025/, Projects/ already exist \u2014 keep those, group everything loose into Personal/ vs Work/), then triage HooliMail by archiving anything older than 6 months. Save a one-page write-up in LibreOffice describing the new system. Post an 'inbox zero' celebration message in HooliWork #random. Use Files (Nautilus) for the actual move/rename steps so I can see the tree structure visually as it gets organized.",
    "apps_involved": [
      "Files",
      "HooliMail",
      "HooliWork",
      "LibreOffice"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent uses Files (Nautilus) to survey the existing ~/Documents tree and identifies which loose files should go into Personal/ vs Work/",
          "type": "llm_judge",
          "weight": 0.169811
        },
        {
          "criterion": "Agent creates ~/Documents/Personal/ and ~/Documents/Work/ folders (if missing) and moves loose files into them by topic",
          "type": "llm_judge",
          "weight": 0.188679
        },
        {
          "criterion": "Agent opens HooliMail and archives messages older than 6 months from the inbox",
          "type": "llm_judge",
          "weight": 0.169811
        },
        {
          "criterion": "Agent writes a one-page 'My new file/email system' memo in LibreOffice Writer and saves it under ~/Documents",
          "type": "llm_judge",
          "weight": 0.169811
        },
        {
          "criterion": "Agent posts an 'inbox zero today' celebration message in the HooliWork #random channel",
          "type": "llm_judge",
          "weight": 0.141509
        },
        {
          "criterion": "Agent confirms each artifact was created/moved (memo saved, files re-organized, messages archived, channel post visible) without requiring exact counts",
          "type": "llm_judge",
          "weight": 0.103774
        },
        {
          "criterion": "Agent uses the Files (Nautilus) GUI (not just shell commands) for the visible file-tree reorganization so directory moves are observable in the file-manager UI",
          "type": "llm_judge",
          "weight": 0.056604
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f043",
    "app": "multi_app",
    "instruction": "Am I actually showing up to improv class consistently? Pair my recurring Scranton Improv charges in Gringotts against my HooliCalendar improv blocks, list any charges that don't have a matching calendar block (or vice versa), reference my Improv Class Certificate file in ~/Downloads/ for academy context, and email the Scranton Improv Academy a friendly note explaining the gaps and asking about a make-up class. Save the attendance memo to ~/Documents.",
    "apps_involved": [
      "Gringotts",
      "HooliCalendar",
      "HooliMail",
      "Files",
      "LibreOffice"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent pulls every Scranton Improv-related charge from Gringotts (the bi-weekly SCRANTON IMPROV ACADEMY subscription transactions as they appear live) with date and amount",
          "type": "llm_judge",
          "weight": 0.181818
        },
        {
          "criterion": "Agent pulls every improv-related event from HooliCalendar with date and time",
          "type": "llm_judge",
          "weight": 0.163636
        },
        {
          "criterion": "Agent pairs charges to calendar blocks and lists mismatches (charges with no block, blocks with no charge)",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent composes and sends/drafts a HooliMail to a Scranton Improv Academy contact (any reasonable academy address) explaining the attendance gaps and asking about a make-up class",
          "type": "llm_judge",
          "weight": 0.181818
        },
        {
          "criterion": "Agent writes the attendance reconciliation memo in LibreOffice Writer and saves it under ~/Documents",
          "type": "llm_judge",
          "weight": 0.118182
        },
        {
          "criterion": "Agent verifies the email sent + memo file + reconciliation summary all persist",
          "type": "llm_judge",
          "weight": 0.063636
        },
        {
          "criterion": "Agent reads ~/Downloads/Improv_Class_Certificate.txt for academy context (completion level, instructor comments) and references it in the email rather than fabricating academy details",
          "type": "llm_judge",
          "weight": 0.090909
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f044",
    "app": "multi_app",
    "instruction": "Write me a NYC trip post-mortem. Pull the Greenwich Hotel booking total from Cheskepdia, the NYC flight cost from Dinoco, and any related Gringotts charges. Compile the full damage in LibreOffice Writer, save under ~/Documents/Trips/, then submit a 4-star Cheskepdia review for The Greenwich praising the location but noting it ran a bit pricey. Optionally throw a 'NYC was great, here's what I learned' update to LockedIn.",
    "apps_involved": [
      "Cheskepdia",
      "Dinoco Airlines",
      "Gringotts",
      "HooliCalendar",
      "LibreOffice",
      "Files",
      "LockedIn"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent pulls The Greenwich Hotel booking total from Cheskepdia",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent pulls the Dinoco NYC flight cost from the airline app",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent cross-references Gringotts for related NYC trip charges in the Dinoco NYC flight window (the 3-4 day NYC block around the Dinoco booking)",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent reads the HooliCalendar NYC trip block (the calendar events spanning the Dinoco NYC flight window) for trip context",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent compiles the post-mortem write-up in LibreOffice Writer and saves it under ~/Documents/Trips/",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent submits a 4-star Cheskepdia review for The Greenwich Hotel praising location and noting price",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent posts a 'NYC trip recap' update to LockedIn",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent verifies the saved post-mortem, the Cheskepdia review, and the LockedIn post all persist",
          "type": "llm_judge",
          "weight": 0.05
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f045",
    "app": "multi_app",
    "instruction": "I want a snapshot of all my IT-style assets. Open Files (Nautilus) and inventory ~/Documents (file count + total size by subfolder), pull the project list from SprintBoard's three projects (Team Morale Initiative Q2, Diversity Day Planning 2026, Movie Monday Program), and write me a one-page IT inventory memo in LibreOffice. Then post a 'just did an IT inventory' message in HooliWork #random so the team knows I'm getting my house in order. Use Files (Nautilus) to walk the ~/Documents tree visually as part of the IT inventory pass \u2014 the file manager view is what makes it obvious what's actually in there.",
    "apps_involved": [
      "Files",
      "LibreOffice",
      "HooliWork",
      "SprintBoard"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent uses Files (Nautilus) to inventory ~/Documents files per subfolder with counts and total size",
          "type": "llm_judge",
          "weight": 0.207547
        },
        {
          "criterion": "Agent pulls the SprintBoard project list (Team Morale Initiative Q2, Diversity Day Planning 2026, Movie Monday Program) with task counts",
          "type": "llm_judge",
          "weight": 0.188679
        },
        {
          "criterion": "Agent composes a one-page IT inventory memo in LibreOffice Writer",
          "type": "llm_judge",
          "weight": 0.207547
        },
        {
          "criterion": "Agent saves the memo to ~/Documents",
          "type": "llm_judge",
          "weight": 0.150943
        },
        {
          "criterion": "Agent posts an 'IT inventory complete' message in HooliWork #random",
          "type": "llm_judge",
          "weight": 0.122642
        },
        {
          "criterion": "Agent verifies the saved memo + the channel post both persist",
          "type": "llm_judge",
          "weight": 0.066038
        },
        {
          "criterion": "Agent uses Files (Nautilus) to walk ~/Documents visually during the IT inventory pass (not just shell ls / cat)",
          "type": "llm_judge",
          "weight": 0.056604
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f046",
    "app": "multi_app",
    "instruction": "Jan rejected my latest expense report and I need to clean it up. Open the rejection thread in HooliMail (from Jan Levinson re: your expense report being rejected) and read the live list of flagged line items with reasons, then for each flagged line item pull supporting evidence from any source app that matches the spend category (e.g. TableFind/HangryDash for dining-type items, HooliShop for purchased goods, eTaxi for ride spend) \u2014 exact-amount matches are NOT required, approximate/partial matches in the same category and rough amount range are sufficient evidence. Also cross-check Gringotts for any matching card charges. Cross-check none of them are also claimed in my most-recent SpeedTax deductions to avoid double-dipping. Save the corrected expense report in LibreOffice Calc as a line-item reconciliation table as ~/Documents/Q1_expense_resubmit.ods and email it back to Jan as a HooliMail draft (don't send yet, I want to read it first).",
    "apps_involved": [
      "HooliMail",
      "SpeedTax",
      "Gringotts",
      "LibreOffice Calc",
      "HangryDash",
      "TableFind",
      "HooliShop",
      "Files"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens the Jan rejection email thread in HooliMail and extracts the specific rejected line items with reasons",
          "type": "llm_judge",
          "weight": 0.11801
        },
        {
          "criterion": "Agent reports, for each flagged item in Jan's rejection list, whether supporting evidence exists in the expected source-app category (TableFind/HangryDash for dining; HooliShop for purchased goods; eTaxi for ride spend) \u2014 approximate-amount or category-only matches are acceptable, and the agent does NOT need to find an exact-dollar match for the email's stated figures; the agent should explicitly note when only partial/approximate evidence is found",
          "type": "llm_judge",
          "weight": 0.163399
        },
        {
          "criterion": "Agent cross-checks Gringotts for matching card charges that loosely correspond to the flagged items (any approximate or category-consistent match counts; exact-amount match not required)",
          "type": "llm_judge",
          "weight": 0.11801
        },
        {
          "criterion": "Agent cross-checks the most-recent SpeedTax return's deductions to flag any rejected item that's already claimed (double-dip)",
          "type": "llm_judge",
          "weight": 0.136166
        },
        {
          "criterion": "Agent composes the corrected resubmission in LibreOffice Calc with per-item explanations",
          "type": "llm_judge",
          "weight": 0.145243
        },
        {
          "criterion": "Agent saves the resubmission to ~/Documents/Q1_expense_resubmit.ods",
          "type": "llm_judge",
          "weight": 0.090777
        },
        {
          "criterion": "Agent opens HooliMail and drafts a reply to Jan with the resubmission summary, saved as a draft (not sent)",
          "type": "llm_judge",
          "weight": 0.11801
        },
        {
          "criterion": "Agent confirms the draft, the saved doc, and the receipt-matching list all persist after the workflow ends",
          "type": "llm_judge",
          "weight": 0.036311
        },
        {
          "criterion": "Agent rebuilds the corrected expense report as a LibreOffice Calc table (one row per line item with source app, amount, deduction-status columns) so Jan can audit it cleanly",
          "type": "llm_judge",
          "weight": 0.074074
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f047",
    "app": "multi_app",
    "instruction": "I want to write proper Cheskepdia reviews of both Sandals stays. Pull both Sandals bookings from Cheskepdia (Sandals Montego Bay and Sandals Royal Barbados), cross-check Gringotts for the actual posted hotel charges, then write a 5-star review for each stay in Cheskepdia. Save the review drafts in LibreOffice under ~/Documents/Trips/ first so I can edit before posting. Email Holly a heads-up that I'm writing the Sandals reviews, and DM Pam in HooliChat that the Sandals trips were 'somehow I managed' levels of luxury. Post a 'luxury traveler era' update on LockedIn.",
    "apps_involved": [
      "Cheskepdia",
      "Gringotts",
      "LockedIn",
      "LibreOffice",
      "Files",
      "HooliMail",
      "HooliChat"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens both Sandals Cheskepdia bookings (Sandals Montego Bay and Sandals Royal Barbados) and reads the live totals / dates / property names / confirmation numbers",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent cross-checks Gringotts for the matching hotel charges on both Sandals bookings",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent drafts 5-star reviews for both stays in LibreOffice Writer and saves under ~/Documents/Trips/",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent either (a) saves both 5-star LibreOffice review drafts under ~/Documents/Trips/ AND submits both reviews on the Sandals Montego Bay and Royal Barbados Cheskepdia bookings, OR (b) saves both LibreOffice drafts AND composes both Cheskepdia reviews as drafts awaiting Michael's review before posting (honoring the 'first so I can edit before posting' instruction)",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent emails Holly via HooliMail about writing the Sandals reviews",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent DMs Pam in HooliChat with a 'somehow I managed' luxury Sandals message",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent posts a 'luxury traveler era' update to LockedIn referencing the Sandals stays",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent verifies both reviews + both messages + the LockedIn post all persist after refresh",
          "type": "llm_judge",
          "weight": 0.03
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f048",
    "app": "multi_app",
    "instruction": "Time for a full prior-year financial close-out \u2014 I want to know what kind of mess I made and what to do about it. Pull my full prior-year Gringotts spend by category, my BatBucks portfolio at cost basis, my OddsMarket position summary, and my SpeedTax most-recent-filed-year status. Identify the top 3 leak categories where I overspent. Save the year-end rollup (category x month) as a LibreOffice Calc spreadsheet under ~/Documents so it's sortable with formulas, and title it 'Financial close-out + current-year game plan.' Send myself a journal email of the takeaways via HooliMail. Drop a private 'fiscal discipline' update in HooliChat's Finer Things Club group, and a public version on LockedIn.",
    "apps_involved": [
      "Gringotts",
      "BatBucks",
      "OddsMarket",
      "SpeedTax",
      "LibreOffice Calc",
      "HooliMail",
      "Files",
      "LockedIn",
      "HooliChat"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent pulls Gringotts prior-year total spend grouped by category",
          "type": "llm_judge",
          "weight": 0.118182
        },
        {
          "criterion": "Agent pulls BatBucks cost-basis portfolio total and holdings list",
          "type": "llm_judge",
          "weight": 0.090909
        },
        {
          "criterion": "Agent pulls OddsMarket positions, balance, and PnL",
          "type": "llm_judge",
          "weight": 0.090909
        },
        {
          "criterion": "Agent pulls SpeedTax most-recent-filed return status and headline numbers",
          "type": "llm_judge",
          "weight": 0.090909
        },
        {
          "criterion": "Agent identifies the 3 highest-overrun spending categories",
          "type": "llm_judge",
          "weight": 0.109091
        },
        {
          "criterion": "Agent composes the close-out write-up in LibreOffice and saves under ~/Documents",
          "type": "llm_judge",
          "weight": 0.136364
        },
        {
          "criterion": "Agent sends a self-journal email via HooliMail with the takeaways",
          "type": "llm_judge",
          "weight": 0.090909
        },
        {
          "criterion": "Agent posts a 'fiscal discipline' message in the HooliChat Finer Things Club group",
          "type": "llm_judge",
          "weight": 0.090909
        },
        {
          "criterion": "Agent posts a 'fiscal discipline' update on LockedIn",
          "type": "llm_judge",
          "weight": 0.090909
        },
        {
          "criterion": "Agent uses LibreOffice Calc (spreadsheet, not Writer) so the year-end rollup is structured by category x month with formulas, not free-form prose",
          "type": "llm_judge",
          "weight": 0.090909
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f049",
    "app": "multi_app",
    "instruction": "TLM2 production roadmap. Open the Threat Level Midnight sequel notes in ~/Documents/Projects/, expand the outline by at least one act, scope the budget by tallying available cash (BatBucks dividends + OddsMarket balance + Gringotts savings), search Cheskepdia for production-friendly venues, scope catering via TableFind + HangryDash, write a teaser LockedIn post + a Fan Club teaser in HooliChat, and stand up a SprintBoard task list to manage the production. Save the expanded outline back to the same file. Save the production budget as a LibreOffice Calc spreadsheet (categories x sources) \u2014 the outline itself stays in the existing notes file.",
    "apps_involved": [
      "Files",
      "LibreOffice Calc",
      "BatBucks",
      "OddsMarket",
      "Gringotts",
      "Cheskepdia",
      "TableFind",
      "HangryDash",
      "LockedIn",
      "HooliChat",
      "SprintBoard"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reads ~/Documents/Projects/Threat_Level_Midnight_Sequel_Notes.txt as the starting outline",
          "type": "llm_judge",
          "weight": 0.092593
        },
        {
          "criterion": "Agent expands the outline by at least one new act and writes the additions back to the file",
          "type": "llm_judge",
          "weight": 0.148148
        },
        {
          "criterion": "Agent tallies the production budget by summing BatBucks dividends + OddsMarket balance + Gringotts savings",
          "type": "llm_judge",
          "weight": 0.12963
        },
        {
          "criterion": "Agent searches Cheskepdia for production-friendly venues and lists at least 2 candidates",
          "type": "llm_judge",
          "weight": 0.092593
        },
        {
          "criterion": "Agent scopes catering options via TableFind and HangryDash with at least 2 candidates total",
          "type": "llm_judge",
          "weight": 0.092593
        },
        {
          "criterion": "Agent writes a teaser LockedIn post about TLM2 in development",
          "type": "llm_judge",
          "weight": 0.111111
        },
        {
          "criterion": "Agent posts a Fan Club teaser in the HooliChat Threat Level Midnight Fan Club group",
          "type": "llm_judge",
          "weight": 0.092593
        },
        {
          "criterion": "Agent stands up a SprintBoard task list / project entry for the TLM2 production roadmap",
          "type": "llm_judge",
          "weight": 0.12037
        },
        {
          "criterion": "Agent saves the expanded outline back to the original notes file",
          "type": "llm_judge",
          "weight": 0.046296
        },
        {
          "criterion": "Agent saves the TLM2 production budget as a LibreOffice Calc spreadsheet split by funding source (BatBucks dividends + OddsMarket balance + Gringotts savings)",
          "type": "llm_judge",
          "weight": 0.074074
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f050",
    "app": "multi_app",
    "instruction": "Run the full upcoming-Dundies lifecycle plan. Open the Dundies categories doc in ~/Documents, build a venue shortlist on Cheskepdia, scope catering via TableFind + HangryDash, send save-the-date HooliMails to Pam, Jim, Dwight, Kevin, Holly. Coordinate logistics in the HooliChat Dundie Awards Planning group. Block the day in HooliCalendar. Stand up a Dundies SprintBoard task list. Post a teaser LockedIn 'Dundies are happening this year' update.",
    "apps_involved": [
      "HooliMail",
      "HooliChat",
      "LibreOffice",
      "Cheskepdia",
      "TableFind",
      "HangryDash",
      "HooliCalendar",
      "SprintBoard",
      "LockedIn",
      "Files"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens the Dundies categories doc in ~/Documents and reads the category list",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent builds a Cheskepdia venue shortlist of at least 2 candidates",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent scopes catering on TableFind and HangryDash with at least 2 candidates total",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent sends save-the-date HooliMails to Pam, Jim, Dwight, Kevin, and Holly",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent posts a logistics-coordination message in HooliChat Dundie Awards Planning",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent blocks the Dundies day on HooliCalendar with at least a half-day event",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent stands up a Dundies prep SprintBoard task list (or project) with at least 5 tasks",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent posts a 'Dundies are happening this year' teaser on LockedIn",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent confirms all artifacts (emails sent, channel post, calendar event, SprintBoard tasks, LockedIn post) persist",
          "type": "llm_judge",
          "weight": 0.06
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f051",
    "app": "multi_app",
    "instruction": "I want to dry-run a job search without actually quitting. Polish my LockedIn profile (review headline, about, post cadence). Draft a generic cover letter template in LibreOffice. Search Cheskepdia for a 1-bedroom in a major US city outside Scranton (NYC, Chicago, Philly \u2014 pick one). Pull my Gringotts checking + savings + BatBucks cash to compute my unemployment-buffer runway. Check Dinoco for any existing flights to interview-likely cities. Email myself a 'job search readiness' summary via HooliMail. Stage all artifacts under ~/Documents/Personal/job_search/ in Files.",
    "apps_involved": [
      "LockedIn",
      "LibreOffice",
      "HooliMail",
      "Cheskepdia",
      "Gringotts",
      "BatBucks",
      "Files",
      "Dinoco Airlines"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reviews LockedIn profile (headline, about, recent posts) and notes anything that needs polish",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent drafts a generic cover-letter template in LibreOffice Writer",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent searches Cheskepdia for 1-bedroom listings in NYC or Philly (both have seeded listings; Chicago has none) and shortlists 2-3",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent pulls Gringotts checking + savings + BatBucks cash and computes the unemployment-buffer runway in months",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent pulls Dinoco for any flights to interview-likely cities (or notes none exist yet)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent emails himself a 'job search readiness' summary via HooliMail",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent stages cover letter + runway calc + listings under ~/Documents/Personal/job_search/",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent verifies all 4-5 artifacts persist after the workflow ends",
          "type": "llm_judge",
          "weight": 0.06
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f053",
    "app": "multi_app",
    "instruction": "Pretzel Day is coming up this year and I want to make it the best one ever. Block the date on HooliCalendar, post the official Pretzel Day announcement in HooliWork #general + #random, get a head-count poll going in HooliChat Party Planning Committee, schedule a TableFind reservation for the post-Pretzel-Day team lunch, scope a HangryDash dessert order to bring back to the office for the second wave, write the run-of-show in LibreOffice and save it under ~/Documents/Work/, and post a 'come get pretzeled' tease on LockedIn so the whole Scranton paper world sees what they're missing. Order Pretzel Day swag (paper hats, salt packets, custom mugs that say 'World's Best Pretzel Boss') from HooliShop.",
    "apps_involved": [
      "HooliCalendar",
      "HooliWork",
      "HooliChat",
      "TableFind",
      "HangryDash",
      "LibreOffice",
      "Files",
      "LockedIn",
      "HooliShop"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent blocks Pretzel Day on HooliCalendar with appropriate reminders",
          "type": "llm_judge",
          "weight": 0.122642
        },
        {
          "criterion": "Agent posts the Pretzel Day announcement in HooliWork #general AND #random",
          "type": "llm_judge",
          "weight": 0.141509
        },
        {
          "criterion": "Agent starts a head-count / RSVP poll or thread in HooliChat Party Planning Committee",
          "type": "llm_judge",
          "weight": 0.122642
        },
        {
          "criterion": "Agent schedules a TableFind reservation for the post-Pretzel-Day team lunch",
          "type": "llm_judge",
          "weight": 0.122642
        },
        {
          "criterion": "Agent scopes / drafts a HangryDash dessert order for the office second-wave",
          "type": "llm_judge",
          "weight": 0.122642
        },
        {
          "criterion": "Agent writes the Pretzel Day run-of-show in LibreOffice and saves it under ~/Documents/Work/",
          "type": "llm_judge",
          "weight": 0.169811
        },
        {
          "criterion": "Agent posts a 'come get pretzeled' tease on LockedIn",
          "type": "llm_judge",
          "weight": 0.09434
        },
        {
          "criterion": "Agent confirms all artifacts (calendar block, channel posts, reservation, draft order, run-of-show, LockedIn tease) persist",
          "type": "llm_judge",
          "weight": 0.04717
        },
        {
          "criterion": "Agent orders Pretzel Day swag from HooliShop (e.g., 'World's Best Pretzel Boss' mugs, themed napkins) for the office distribution",
          "type": "llm_judge",
          "weight": 0.056604
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f054",
    "app": "multi_app",
    "instruction": "I keep daydreaming about starting the Michael Scott Paper Company on the side \u2014 same idea as before but smarter this time. Scope it as a real side-business research project: pull together a back-of-envelope feasibility memo. Use BatBucks to confirm I have reasonable seed capital available for a paper-distribution side business, OddsMarket to gauge market sentiment on regional paper distributors (any prediction markets), LockedIn to pull together 3-5 paper-adjacent candidates I'd want to recruit (accept any mix of established connections, pending connection_requests from paper/office-supply senders, and named ex-colleagues), HooliMail to draft a discreet 'exploring something new \u2014 let's grab coffee' note to those candidates, search Cheskepdia for a small co-working / serviced office for week-1 ops, scope Dinoco for a single weekend trip to Philadelphia for a regional supplier meet-and-greet, file the full feasibility memo in LibreOffice under ~/Documents/Personal/msp_company/, and post a vague 'cooking up something big' LockedIn teaser without naming the paper-company angle. The feasibility memo's financial model (capital + first-12-month burn) goes into LibreOffice Calc. Pre-fill a Hoolishop delivery with the bare-bones MSP Company week-1 office supplies (notebooks, pens, coffee, paper).",
    "apps_involved": [
      "BatBucks",
      "OddsMarket",
      "LockedIn",
      "HooliMail",
      "Cheskepdia",
      "Dinoco Airlines",
      "LibreOffice Calc",
      "Files",
      "Kwik-E-Mart",
      "HooliShop"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent pulls BatBucks at-cost portfolio + cash balance and assesses whether reasonable seed capital for a paper-distribution side business is realistic given Michael's live balances",
          "type": "llm_judge",
          "weight": 0.1157
        },
        {
          "criterion": "Agent skims OddsMarket for any prediction markets relevant to regional paper distributors (or notes none exist)",
          "type": "llm_judge",
          "weight": 0.089
        },
        {
          "criterion": "Agent identifies 3-5 paper-adjacent contacts via LockedIn for potential recruitment (accept established connections, pending connection_requests from paper/office-supply senders, or named ex-colleagues \u2014 minimum 3 named)",
          "type": "llm_judge",
          "weight": 0.133499
        },
        {
          "criterion": "Agent drafts a discreet 'exploring something new' HooliMail to those 5 contacts (saved as draft, NOT sent broadly)",
          "type": "llm_judge",
          "weight": 0.133499
        },
        {
          "criterion": "Agent searches Cheskepdia for a small co-working / serviced office and shortlists at least 1 option",
          "type": "llm_judge",
          "weight": 0.089
        },
        {
          "criterion": "Agent scopes Dinoco for a weekend Philadelphia trip for the supplier meet",
          "type": "llm_judge",
          "weight": 0.089
        },
        {
          "criterion": "Agent saves the full feasibility memo in LibreOffice under ~/Documents/Personal/msp_company/",
          "type": "llm_judge",
          "weight": 0.133499
        },
        {
          "criterion": "Agent posts a vague 'cooking up something big' LockedIn teaser without naming Michael Scott Paper Company explicitly",
          "type": "llm_judge",
          "weight": 0.1068
        },
        {
          "criterion": "Agent saves the MSP Company financial model (seed capital + first-12-month burn forecast) as a LibreOffice Calc workbook",
          "type": "llm_judge",
          "weight": 0.0534
        },
        {
          "criterion": "Agent pre-fills a HooliShop delivery with MSP Company week-1 office supplies (notebooks, pens, coffee, paper)",
          "type": "llm_judge",
          "weight": 0.056604
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f055",
    "app": "multi_app",
    "instruction": "I need to prep for my mid-year self-review with David Wallace. Pull my SprintBoard project record across all my active projects, sample HooliWork participation in #sales + #general (rough message count), look at my LockedIn posts + connections, check my HooliMail-sent volume to David Wallace + Jan, and pull my Gringotts paycheck deposits for comp context, then write a self-assessment in LibreOffice Writer. Schedule a HooliCalendar 1-on-1 with David Wallace. Email David an FYI that the self-review is coming via HooliMail.",
    "apps_involved": [
      "LockedIn",
      "HooliMail",
      "HooliChat",
      "SprintBoard",
      "LibreOffice",
      "Gringotts",
      "HooliWork",
      "HooliCalendar"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent tallies SprintBoard completed/in-progress tasks across Michael-assigned or Michael-reported projects",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent samples HooliWork participation in #sales and #general",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent reviews LockedIn post count and connections",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent tallies HooliMail sent volume to David Wallace and Jan",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent pulls Gringotts comp context (paycheck deposits)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent composes a self-assessment in LibreOffice Writer and saves it locally (any reasonable path)",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent schedules a HooliCalendar 1-on-1 with David Wallace",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent emails David Wallace via HooliMail with an FYI about the upcoming self-review",
          "type": "llm_judge",
          "weight": 0.14
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f056",
    "app": "multi_app",
    "instruction": "Diversity Day is coming up and Pam's running it on SprintBoard but I want to take it over and make this year actually iconic \u2014 I am the regional manager. Open the Diversity Day Planning 2026 board on SprintBoard and add 5 specific tasks (cultural showcase booths, dietary-friendly catering scope, comms plan, post-event survey, run-of-show doc). Block the event date on HooliCalendar with a 1-week prep sprint. Reserve a TableFind venue for the cultural-food sampling. Scope a HangryDash diverse-cuisines order for the day-of office spread. Pre-fill a Kwik-E-Mart office-supply pickup for decorations. Send the formal HooliMail invite to the all-hands. Post the announcement in HooliWork #general + HooliChat Party Planning Committee. Save the run-of-show in LibreOffice under ~/Documents/Work/. Post a LockedIn 'celebrating what makes Scranton, Scranton' message.",
    "apps_involved": [
      "SprintBoard",
      "HooliCalendar",
      "TableFind",
      "HangryDash",
      "Kwik-E-Mart",
      "HooliMail",
      "HooliWork",
      "HooliChat",
      "LibreOffice",
      "Files",
      "LockedIn"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens or creates a Diversity Day Planning 2026 SprintBoard board (project) and adds at least 5 specific new tasks covering cultural showcase, catering, comms, survey, and run-of-show",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent blocks the Diversity Day event on HooliCalendar with a 1-week prep sprint window",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent reserves a TableFind venue for the cultural-food sampling",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent scopes / drafts a HangryDash multi-cuisine order for the day-of office spread",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent pre-fills a Kwik-E-Mart office-supply pickup for decorations",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent sends a formal HooliMail invite to the all-hands distribution list",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent posts the announcement in HooliWork #general AND in HooliChat Party Planning Committee",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent saves the Diversity Day run-of-show in LibreOffice under ~/Documents/Work/",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent posts a LockedIn 'celebrating Scranton' message about the event",
          "type": "llm_judge",
          "weight": 0.07
        },
        {
          "criterion": "Agent confirms all artifacts persist (board tasks, calendar block, reservation, draft order, supply pickup, invite, channel posts, run-of-show, LockedIn post)",
          "type": "llm_judge",
          "weight": 0.05
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f057",
    "app": "multi_app",
    "instruction": "I want to relaunch Movie Monday as a quarterly outdoor projection night at the office park \u2014 bigger than the indoor break-room version Pam keeps running. Open the Movie Monday Program board on SprintBoard and add 6 specific tasks (projector rental scope, outdoor movie license check, snack-bar coordination, weather-backup plan, RSVP comms, post-event highlight reel). Block 4 quarterly outdoor Movie Monday dates on HooliCalendar. Pre-fill a Kwik-E-Mart office-snacks pickup (popcorn + drinks) and place a HangryDash late-night dessert order for the first event. Draft the formal Movie Monday relaunch HooliMail to the all-hands. Post the launch in HooliChat Party Planning Committee + HooliWork #general. Save the full Movie Monday relaunch plan in LibreOffice under ~/Documents/Work/. Post a 'bringing back movie nights, properly this time' LockedIn announcement. The relaunch plan + per-quarter budget go into a LibreOffice Calc workbook so I can copy a tab per event.",
    "apps_involved": [
      "SprintBoard",
      "HooliCalendar",
      "Kwik-E-Mart",
      "HangryDash",
      "HooliMail",
      "HooliChat",
      "HooliWork",
      "LibreOffice Calc",
      "LockedIn"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens or creates a Movie Monday Program SprintBoard board (project) and adds at least 6 specific new tasks covering projector rental, license check, snacks, weather backup, RSVP comms, and highlight reel",
          "type": "llm_judge",
          "weight": 0.17
        },
        {
          "criterion": "Agent blocks 4 quarterly outdoor Movie Monday dates on HooliCalendar",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent pre-fills a Kwik-E-Mart office-snacks pickup with popcorn + drinks",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent places a HangryDash late-night dessert order for the first event",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent drafts the formal Movie Monday relaunch HooliMail to the all-hands distribution",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent posts the launch in HooliChat Party Planning Committee AND HooliWork #general",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent saves the full Movie Monday relaunch plan in LibreOffice under ~/Documents/Work/",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent posts a 'bringing back movie nights properly this time' LockedIn announcement",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent confirms all artifacts persist (board tasks, 4 calendar blocks, supply pickup, dessert delivery, draft email, channel posts, relaunch plan, LockedIn post)",
          "type": "llm_judge",
          "weight": 0.03
        },
        {
          "criterion": "Agent saves the Movie Monday relaunch plan + per-quarter budget as a LibreOffice Calc workbook with one tab per quarterly event",
          "type": "llm_judge",
          "weight": 0.07
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f058",
    "app": "multi_app",
    "instruction": "I'm thinking about moving closer to the office. Apartment shortlist on Cheskepdia, scope move costs against Gringotts checking + savings, calculate eTaxi savings if I'm closer (compare current commute count to 1725 Slough Ave), block the projected move week on HooliCalendar, propose change-of-address handling in HooliMail to friends/family + HooliChat to the team, stand up a SprintBoard board for the move, write the move plan in LibreOffice, post a 'big change coming' LockedIn note, scope grocery resupply via Kwik-E-Mart + HangryDash for first-day-in-new-place. Save under ~/Documents/Personal/move/.",
    "apps_involved": [
      "Cheskepdia",
      "Gringotts",
      "HooliCalendar",
      "eTaxi",
      "HooliMail",
      "HooliChat",
      "SprintBoard",
      "LibreOffice",
      "Files",
      "LockedIn",
      "Kwik-E-Mart",
      "HangryDash"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent searches Cheskepdia for accommodations closer to 1725 Slough Ave (Scranton, PA) and shortlists 2-3 candidates from the search results; accept apartments, studios, or any residential listing in Scranton",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent pulls Gringotts checking + savings and scopes a move budget",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent pulls eTaxi commute history and estimates the savings if closer",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent blocks the projected move-week dates on HooliCalendar",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent composes at least 2 change-of-address HooliMails to friends/family",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent posts a heads-up about the move in a HooliChat office/team group (e.g. Party Planning Committee or any team conversation)",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent stands up a SprintBoard board for the move with at least 5 tasks",
          "type": "llm_judge",
          "weight": 0.13
        },
        {
          "criterion": "Agent writes the move plan in LibreOffice and saves under ~/Documents/Personal/move/",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent scopes first-day-in-new-place groceries via Kwik-E-Mart and HangryDash",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent posts a 'big change coming' note on LockedIn",
          "type": "llm_judge",
          "weight": 0.07
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f060",
    "app": "multi_app",
    "instruction": "I want a real vacation this year \u2014 somewhere I haven't been (not Jamaica, not Barbados, not NYC, not Philly). Research a destination on Cheskepdia, book me flights from AVP on Dinoco, scope eTaxi for the airport ride, build me a day-by-day itinerary in LibreOffice, block the dates on my calendar, send a group invite to Pam, Jim, and Holly via HooliMail, check my Gringotts for budget, post a 'big trip coming' tease on LockedIn, and save the whole plan under ~/Documents/Trips/.",
    "apps_involved": [
      "Cheskepdia",
      "Dinoco Airlines",
      "eTaxi",
      "Gringotts",
      "HooliCalendar",
      "HooliMail",
      "Files",
      "TableFind",
      "HangryDash",
      "LibreOffice",
      "LockedIn"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent picks a destination NEW to Michael (not Jamaica/Barbados/NYC/Philly)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent searches Cheskepdia for accommodations in the chosen destination",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent searches Dinoco for flights from AVP to the destination",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent scopes eTaxi airport transport",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent builds a day-by-day itinerary in LibreOffice",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent blocks the trip dates on HooliCalendar",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent sends a group HooliMail invite to Pam + Jim + Holly",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent pulls Gringotts to scope budget",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent posts a 'Big trip coming' LockedIn teaser",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent saves the trip plan under ~/Documents/Trips/<destination>.odt",
          "type": "llm_judge",
          "weight": 0.09
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f062",
    "app": "multi_app",
    "instruction": "I want to lead an office culture initiative. Draft my 'World's Best Workplace' proposal in LibreOffice. Stand up a SprintBoard board for execution. Post the rollout to HooliWork #random. Cross-post a 'culture push announcement' in any team-wide HooliChat group. Send a HooliMail to David Wallace requesting sponsor approval. Block the kickoff date on HooliCalendar. Scope a kickoff lunch via TableFind + HangryDash. Search Cheskepdia for a quarterly off-site venue. Pull Gringotts to scope the initiative budget. Post a public LockedIn 'leading culture change' update. Save the proposal under ~/Documents/Work/. Order culture-initiative swag (T-shirts, stickers) from HooliShop and capture the spend in SpeedTax as a current-quarter business-expense line.",
    "apps_involved": [
      "SprintBoard",
      "HooliMail",
      "HooliChat",
      "HooliWork",
      "LockedIn",
      "LibreOffice",
      "Cheskepdia",
      "TableFind",
      "HangryDash",
      "Gringotts",
      "HooliCalendar",
      "Files",
      "SpeedTax",
      "HooliShop"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent drafts the 'World's Best Workplace' proposal in LibreOffice AND saves it under ~/Documents/Work/",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent stands up a SprintBoard board for execution with at least 5 tasks",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent posts the rollout announcement in HooliWork #random",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent cross-posts a culture-push announcement in a HooliChat office/team group (e.g. Party Planning Committee or any team conversation)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent sends a HooliMail to David Wallace requesting sponsor approval",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent blocks the kickoff date on HooliCalendar",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent scopes a kickoff lunch via TableFind + HangryDash",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent searches Cheskepdia for a quarterly off-site venue",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent pulls Gringotts to scope the initiative budget AND posts a public LockedIn 'leading culture change' update backed by that budget context",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent orders culture-initiative swag from HooliShop and notes the spend as a current-quarter business-expense line in SpeedTax",
          "type": "llm_judge",
          "weight": 0.1
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f065",
    "app": "multi_app",
    "instruction": "Hypothetical: what if I get fired tomorrow? Compute my total runway from Gringotts checking + savings + BatBucks cash + at-cost portfolio + OddsMarket balance and save that runway as a LibreOffice Calc sheet (income vs burn-rate, monthly columns). Identify cuts to HangryDash, Dinoco upgrades, and any non-utility recurring charges in Gringotts. Polish my LockedIn for any urgent reach-outs. Email a small set of trusted friends (Pam, Jim, Holly) via HooliMail asking for any leads. Stand up a SprintBoard board for the 30/60/90 day action plan. Block recovery-focused calendar time on HooliCalendar. Post a 'between chapters' message in HooliChat Finer Things Club (private). Pull the most-recent SpeedTax return to confirm any tax refund coming. If my runway looks tight, scope a cheaper Cheskepdia interim place I could fall back on. Write the emergency playbook in LibreOffice and save everything under ~/Documents/Personal/.",
    "apps_involved": [
      "Gringotts",
      "BatBucks",
      "OddsMarket",
      "SpeedTax",
      "LockedIn",
      "HooliMail",
      "Cheskepdia",
      "Dinoco Airlines",
      "HangryDash",
      "SprintBoard",
      "LibreOffice Calc",
      "Files",
      "HooliCalendar",
      "HooliChat"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent computes total runway from Gringotts checking + savings + BatBucks cash + at-cost portfolio + OddsMarket balance",
          "type": "llm_judge",
          "weight": 0.12963
        },
        {
          "criterion": "Agent identifies specific cuts (HangryDash + Dinoco upgrades + non-utility recurring charges)",
          "type": "llm_judge",
          "weight": 0.111111
        },
        {
          "criterion": "Agent polishes LockedIn for urgent reach-outs",
          "type": "llm_judge",
          "weight": 0.074074
        },
        {
          "criterion": "Agent emails Pam + Jim + Holly via HooliMail asking for leads",
          "type": "llm_judge",
          "weight": 0.111111
        },
        {
          "criterion": "Agent stands up a SprintBoard 30/60/90 day action board with at least 6 tasks",
          "type": "llm_judge",
          "weight": 0.12037
        },
        {
          "criterion": "Agent blocks recovery time on HooliCalendar",
          "type": "llm_judge",
          "weight": 0.074074
        },
        {
          "criterion": "Agent posts a 'between chapters' message in HooliChat Finer Things Club",
          "type": "llm_judge",
          "weight": 0.074074
        },
        {
          "criterion": "Agent pulls the most-recent SpeedTax return for tax-refund context",
          "type": "llm_judge",
          "weight": 0.055556
        },
        {
          "criterion": "Agent scopes a cheaper Cheskepdia interim place if needed",
          "type": "llm_judge",
          "weight": 0.064815
        },
        {
          "criterion": "Agent writes the emergency playbook in LibreOffice and saves under ~/Documents/Personal/",
          "type": "llm_judge",
          "weight": 0.111111
        },
        {
          "criterion": "Agent saves the 'fired tomorrow' runway computation as a LibreOffice Calc sheet (income vs burn-rate, monthly columns) \u2014 not as free-form Writer prose",
          "type": "llm_judge",
          "weight": 0.074074
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f066",
    "app": "multi_app",
    "instruction": "I want to do a real digital declutter \u2014 30 days, kill anything I'm not actually using. Audit my Gringotts recurring charges and flag the deadweight. Skim my HooliMail inbox and unsubscribe from anything I haven't opened in 6 months. Archive any HooliChat DMs I haven't touched in a month. Clean up my Documents folder via Files. Drop any HooliCalendar events I keep declining. Review my LockedIn connections and identify 5 to politely disconnect. Audit my SprintBoard for stale tasks I'll never finish. Trim my OddsMarket watchlist. Save the whole declutter manifesto in LibreOffice. Then post a 'I just dumped 30 things from my digital life' update on LockedIn.",
    "apps_involved": [
      "Gringotts",
      "HooliMail",
      "HooliChat",
      "Files",
      "HooliCalendar",
      "LockedIn",
      "SprintBoard",
      "OddsMarket",
      "LibreOffice",
      "HooliWork"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent identifies the deadweight Gringotts recurring charges (with payee + amount) and flags 3+ for cancel-by-hand",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent skims HooliMail inbox and names at least 3 specific senders to stop receiving from (the listing itself is sufficient; no unsubscribe UI is required)",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent lists inactive HooliChat DMs (no activity in 30 days) \u2014 noting them is sufficient; BuzzChat has no archive UI",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent cleans up ~/Documents via Files (moves loose files into Personal/ or Work/)",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent reviews HooliCalendar for declined / no-show events and proposes drops",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent identifies 5 LockedIn connections to politely disconnect (with reason for each)",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent audits SprintBoard for stale tasks across all of Michael's projects and proposes closures",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent trims the OddsMarket watchlist (removes positions Michael no longer tracks)",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent saves the declutter manifesto in LibreOffice under ~/Documents/Personal/",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent posts the '30 things dumped' update on LockedIn",
          "type": "llm_judge",
          "weight": 0.07
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f070",
    "app": "multi_app",
    "instruction": "Jamaica trip is coming up soon and I have to actually prep this time. Pull the Sandals Montego Bay confirmation from Cheskepdia and my Dinoco Jamaica flight, schedule eTaxi for the wee-hours-of-departure-day airport ride, order travel essentials from HooliShop (sunscreen, beach reads, swim trunks I refuse to pack), restock pantry via Kwik-E-Mart for the day-I-get-back, schedule a final HangryDash dinner the night before so I don't have to cook, set my HooliCalendar Jamaica block + reminders, draft a pre-trip OOO HooliMail template, post a 'going dark for a week' tease in HooliChat Finer Things Club + a similar HooliWork #general note, write a LockedIn 'recharging' post scheduled for departure day, file a Jamaica trip prep checklist in LibreOffice under ~/Documents/Trips/, and pull Gringotts to confirm enough cash + check that my BatBucks portfolio doesn't need rebalancing while I'm gone.",
    "apps_involved": [
      "Cheskepdia",
      "Dinoco Airlines",
      "eTaxi",
      "HooliShop",
      "Kwik-E-Mart",
      "HangryDash",
      "HooliCalendar",
      "HooliMail",
      "HooliChat",
      "HooliWork",
      "LockedIn",
      "LibreOffice",
      "Gringotts",
      "BatBucks"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent surfaces a Sandals/Montego Bay Jamaica booking from Cheskepdia (exact confirmation ID not required \u2014 any Sandals Jamaica reservation in the seed satisfies)",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent surfaces Michael's AVP-to-Jamaica (MBJ) flight info from Dinoco (exact flight number not required \u2014 the Jamaica-bound itinerary in the Dinoco trips/bookings list satisfies)",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent opens eTaxi and schedules (or attempts to schedule) an airport pickup for Jamaica departure morning; accept if the agent navigates to the scheduling screen and sets the destination + time even if the departure date has already passed on eval date (eTaxi rejects past timestamps with a validation error)",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent orders travel essentials from HooliShop (sunscreen / beach reads / swim trunks)",
          "type": "llm_judge",
          "weight": 0.07
        },
        {
          "criterion": "Agent orders pantry restock via Kwik-E-Mart for the day-I-get-back",
          "type": "llm_judge",
          "weight": 0.07
        },
        {
          "criterion": "Agent opens HangryDash and places or attempts to place a dinner order for the night before Jamaica departure; an ASAP order is acceptable if the scheduled date has already passed on eval date (HangryDash rejects past scheduledFor timestamps)",
          "type": "llm_judge",
          "weight": 0.07
        },
        {
          "criterion": "Agent sets a HooliCalendar Jamaica block with reminders",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent drafts a pre-trip OOO HooliMail template and saves it as a draft",
          "type": "llm_judge",
          "weight": 0.08
        },
        {
          "criterion": "Agent posts a 'going dark for a week' message in HooliChat Finer Things Club",
          "type": "llm_judge",
          "weight": 0.07
        },
        {
          "criterion": "Agent posts a similar 'going dark' note in HooliWork #general",
          "type": "llm_judge",
          "weight": 0.06
        },
        {
          "criterion": "Agent schedules / drafts a LockedIn 'recharging' post for departure day",
          "type": "llm_judge",
          "weight": 0.07
        },
        {
          "criterion": "Agent saves the trip-prep checklist in LibreOffice under ~/Documents/Trips/",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent pulls Gringotts checking + BatBucks portfolio to confirm cash + no rebalance needed",
          "type": "llm_judge",
          "weight": 0.08
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f071",
    "app": "multi_app",
    "instruction": "Emma's coming up to Scranton next month for a college-visit weekend (she's looking at U Scranton). I want to make this perfect. Book her a Cheskepdia stay near campus, reserve us a TableFind dinner Friday night + Saturday brunch, order welcome-snacks via HangryDash for her first night, pre-fill her Kwik-E-Mart pickup with stuff she likes, Zelle her some spending money via Gringotts, block the whole weekend on my HooliCalendar, send her a confirming HooliMail with the itinerary, give Holly a heads-up via HooliChat that I'm taking PTO, write Emma a 'welcome to Scranton' card in LibreOffice and save under ~/Documents/Personal/, and post a public 'proud big-brother moment' update on LockedIn (Emma is my younger sister).",
    "apps_involved": [
      "Cheskepdia",
      "TableFind",
      "eTaxi",
      "HangryDash",
      "Kwik-E-Mart",
      "Gringotts",
      "HooliCalendar",
      "HooliMail",
      "HooliChat",
      "LibreOffice",
      "Files",
      "LockedIn"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent books a Cheskepdia stay near U Scranton (University of Scranton) for Emma's visit weekend; accept any Scranton-area listing (apartment, studio, or hotel) returned by the Cheskepdia search",
          "type": "llm_judge",
          "weight": 0.125
        },
        {
          "criterion": "Agent reserves a TableFind dinner Friday night + Saturday brunch (2 reservations)",
          "type": "llm_judge",
          "weight": 0.125
        },
        {
          "criterion": "Agent orders welcome-snacks via HangryDash for Emma's first night",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent pre-fills an Emma-friendly Kwik-E-Mart pickup",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent Zelles Emma spending money via Gringotts with a clear memo",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent blocks the whole weekend on HooliCalendar (PTO)",
          "type": "llm_judge",
          "weight": 0.09
        },
        {
          "criterion": "Agent sends Emma a confirming HooliMail with the full itinerary",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent gives Holly (or any Dunder Mifflin coworker DM/office group) a heads-up via HooliChat about taking PTO that weekend",
          "type": "llm_judge",
          "weight": 0.07
        },
        {
          "criterion": "Agent writes a 'welcome to Scranton' card in LibreOffice and saves under ~/Documents/Personal/",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent posts a public family/family-pride update on LockedIn for Emma's visit (e.g. 'proud big-brother moment', 'proud sibling moment', 'proud of my sister', etc. \u2014 accept any phrasing acknowledging Emma is family and the visit is a positive moment)",
          "type": "llm_judge",
          "weight": 0.09
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f074",
    "app": "multi_app",
    "instruction": "I'm scoping a side-consultancy: 'Somehow I Manage Consulting' \u2014 paper-distributor turnaround advisory for small regional offices. Build me a real go-to-market packet. From BatBucks confirm I have working capital. From OddsMarket pull any market sentiment I can lean on. From SpeedTax confirm whether running this on the side as a 1099 alongside my Dunder Mifflin W-2 creates any tax weirdness. Pull my LockedIn analytics + post engagement to see if my brand has any reach to lean on. Identify 5 LockedIn warm leads (regional managers at competing paper firms). Draft a 'free 30-minute consult' HooliMail outreach template (saved as draft). Search Cheskepdia for a small day-rate meeting space in downtown Scranton for client meetings. Scope Dinoco for monthly day-trips to Philly + Wilkes-Barre for client visits. Plan eTaxi for those trips. Pre-fill HooliShop for a few branded notebooks + pens to bring to client meetings (low budget). Schedule a HangryDash client-lunch order option for the first booked meeting. Reserve TableFind for a discovery dinner with a candidate client. Block a HooliCalendar 'consulting hours' slot every Tuesday + Thursday evening. Stand up a SprintBoard 'SIM Consulting' tracker. Coordinate a HooliChat soft-launch in Finer Things Club + HooliWork #general announcement. Save the full go-to-market packet in LibreOffice under ~/Documents/Personal/sim_consulting/. Post the 'Somehow I Manage Consulting now open' LockedIn launch.",
    "apps_involved": [
      "BatBucks",
      "OddsMarket",
      "SpeedTax",
      "LockedIn",
      "HooliMail",
      "Cheskepdia",
      "Dinoco Airlines",
      "eTaxi",
      "HooliShop",
      "HangryDash",
      "TableFind",
      "HooliCalendar",
      "SprintBoard",
      "HooliChat",
      "HooliWork",
      "LibreOffice",
      "Files"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent confirms working capital from BatBucks (cash + at-cost portfolio), pulls OddsMarket sentiment relevant to regional paper distributors (or notes none exist), AND checks SpeedTax to call out any 1099-side-business tax implications alongside the Dunder Mifflin W-2",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent pulls LockedIn analytics / post engagement to gauge brand reach AND identifies 5 warm leads by name for consultancy outreach (plausible business-role profiles; explicit 'competing paper firm' tagging is not required)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent drafts a 'free 30-minute consult' HooliMail outreach template (saved as draft, NOT broadcast)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent searches Cheskepdia for a small day-rate meeting space in downtown Scranton",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent plans travel for client trips: Dinoco for monthly day-trips to Philadelphia (PHL is a seeded AVP route) AND eTaxi for local Wilkes-Barre ground trips (~20-min drive from Scranton, not a Dinoco route) or for same-day airport transfers",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent pre-fills HooliShop for branded notebooks + pens AND schedules a HangryDash client-lunch order option",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent reserves TableFind for a candidate-client discovery dinner",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent blocks 'consulting hours' Tuesday + Thursday evenings on HooliCalendar",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent stands up a 'SIM Consulting' SprintBoard tracker with at least 4 initial tasks",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent coordinates the soft-launch in HooliChat Finer Things Club AND HooliWork #general, saves the full go-to-market packet in LibreOffice under ~/Documents/Personal/sim_consulting/, AND posts the LockedIn 'Somehow I Manage Consulting now open' launch",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "long_horizon-f075",
    "app": "multi_app",
    "instruction": "I'm seriously thinking about a 3-month sabbatical to finish 'Somehow I Manage' and shoot the Threat Level Midnight sequel. I need to know if I can actually pull this off without my life falling apart. Compute my 3-month survival runway from Gringotts checking + savings + BatBucks at-cost portfolio + OddsMarket balance. Pull SpeedTax to confirm any tax refund cushion. Block the 3-month sabbatical window on HooliCalendar. Stand up a SprintBoard handoff board so my Team Morale + Diversity Day + Movie Monday projects don't die. Coordinate the away-message rollout across HooliMail (out-of-office draft to anyone who emails me), HooliChat (Finer Things Club + one more seeded social group I'm actually in, like the Party Planning Committee or After Hours), and HooliWork (#sales + #general 'going dark' posts). Polish my LockedIn for the 'creative reset' announcement. Search Cheskepdia for a writing-retreat AirBnB. Scope Dinoco flights to the retreat city. Plan eTaxi for the airport. Stock up on retreat essentials via Kwik-E-Mart + HooliShop. Pre-schedule HangryDash deliveries for the first week. Reserve a TableFind send-off dinner in Scranton before departure (TableFind only covers Scranton restaurants). Save the full sabbatical plan as a LibreOffice document under ~/Documents/Personal/sabbatical/ in Files. Post the public LockedIn 'taking a creative sabbatical' announcement (scheduled for the day before I leave).",
    "apps_involved": [
      "Gringotts",
      "BatBucks",
      "OddsMarket",
      "SpeedTax",
      "HooliCalendar",
      "SprintBoard",
      "HooliMail",
      "HooliChat",
      "HooliWork",
      "LockedIn",
      "Cheskepdia",
      "Dinoco Airlines",
      "eTaxi",
      "Kwik-E-Mart",
      "HooliShop",
      "HangryDash",
      "TableFind",
      "LibreOffice",
      "Files"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent computes 3-month runway from Gringotts checking + savings + BatBucks at-cost portfolio + OddsMarket balance AND pulls the most-recent SpeedTax return status for any tax-refund cushion context",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent blocks the 3-month sabbatical window on HooliCalendar",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent stands up a SprintBoard handoff board with at least 6 tasks across Team Morale + Diversity Day + Movie Monday so they don't die during the leave",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent drafts a HooliMail out-of-office message and saves it as a draft",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent posts 'going dark' heads-ups in HooliChat Finer Things Club AND a second seeded team group (e.g. Party Planning Committee or Dundie Awards Planning)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent posts 'going dark' notes in HooliWork #sales AND #general",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent searches Cheskepdia for a writing-retreat AirBnB and shortlists 1-2 candidates",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent plans airport travel: Dinoco flights to the retreat city AND eTaxi for airport pickup + dropoff",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent arranges retreat-city logistics: stocks essentials via Kwik-E-Mart + HooliShop, pre-schedules HangryDash deliveries for the first week, AND reserves a TableFind send-off dinner in Scranton before departure (TableFind only covers Scranton)",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent saves the full sabbatical plan as a LibreOffice document under ~/Documents/Personal/sabbatical/ AND polishes / drafts a 'creative reset' LockedIn announcement (saved or scheduled for day-before-departure)",
          "type": "llm_judge",
          "weight": 0.1
        }
      ]
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "aggregation-f023",
    "app": "oddsmarket",
    "instruction": "Give me a summary of my OddsMarket book in LibreOffice Calc: balance, total invested, profit or loss (split realized vs unrealized if distinguishable), and every open position.",
    "apps_involved": [
      "OddsMarket"
    ],
    "category": "aggregation",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reports both the OddsMarket account cash balance and total invested as specific dollar figures",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent reports P&L \u2014 split realized vs unrealized if distinguishable, else reports net_pnl with the caveat that all current positions are unrealized",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent lists every active open position with title, side, and shares",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent produces a LibreOffice Calc spreadsheet containing the OddsMarket book summary (balance, total invested, P&L, and open positions), as requested in the instruction",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent's OddsMarket book summary lists only the active open positions and does not invent any closed positions",
          "type": "llm_judge",
          "weight": 0.1
        },
        {
          "criterion": "Agent frames the summary with an explicit losing-vs-winning characterization (e.g., calls out which positions are underwater versus up) rather than presenting a neutral data dump",
          "type": "llm_judge",
          "weight": 0.05
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "cua_only-f008",
    "app": "oddsmarket",
    "instruction": "Go into OddsMarket, find my worst-losing position, hit Close, and type CONFIRM in the modal when it pops up.",
    "apps_involved": [
      "OddsMarket"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent identifies a worst-losing open position in OddsMarket with significant unrealized loss (either largest absolute dollar loss >= $10, OR largest percentage drawdown >= 40%) and explains the choice; both Office Reboot (largest $ loss) and GME (largest % drawdown) are acceptable with reasoning before clicking Close",
          "type": "llm_judge",
          "weight": 0.207692
        },
        {
          "criterion": "Agent opens the Close confirmation modal",
          "type": "llm_judge",
          "weight": 0.173077
        },
        {
          "criterion": "Agent types the literal string CONFIRM into the modal input",
          "type": "llm_judge",
          "weight": 0.173077
        },
        {
          "criterion": "Agent confirms the OddsMarket position is closed and removed from the open positions list after typing CONFIRM in the modal",
          "type": "llm_judge",
          "weight": 0.138462
        },
        {
          "criterion": "Agent opens OddsMarket at localhost:3003 and navigates into the open positions list before identifying a worst-losing position (either largest $ loss or largest % drawdown is acceptable)",
          "type": "llm_judge",
          "weight": 0.153846
        },
        {
          "criterion": "Agent clicks the Close button on the chosen worst-losing OddsMarket position (Office Reboot or GME, with reasoning) to launch the CONFIRM modal before typing into the modal input",
          "type": "llm_judge",
          "weight": 0.153846
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f005",
    "app": "oddsmarket",
    "instruction": "Clean up my OddsMarket account: exit any YES positions that are more than 10% underwater from cost basis (per-position P&L% < -10% \u2014 if no position is below that threshold, say so and skip the exit step), and put any freed-up money back into my existing NBC Office reboot/revival position (the one already in my open positions \u2014 that's the one I actually have conviction on). If the Office reboot position itself qualifies for the >10%-underwater exit, the reinvestment step is skipped \u2014 exit it and report the freed cash without reinvesting. Then add the markets I'm still holding to my watchlist so I can keep an eye on them.",
    "apps_involved": [
      "OddsMarket"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent enumerates every open YES position with its P&L (report whatever the live count is; no assumed count)",
          "type": "llm_judge",
          "weight": 0.155
        },
        {
          "criterion": "Agent identifies any OddsMarket YES positions whose per-position P&L% is below -10% (more than 10% underwater from cost basis) that should be exited, OR explicitly reports 'no positions are more than 10% underwater' if every position has P&L% >= -10%",
          "type": "llm_judge",
          "weight": 0.155
        },
        {
          "criterion": "Agent sells/exits any OddsMarket YES positions whose P&L% < -10% in the identification step (no exit is acceptable IF the agent correctly determined none qualify under the -10% threshold)",
          "type": "llm_judge",
          "weight": 0.185
        },
        {
          "criterion": "Agent buys additional YES shares in Michael's existing NBC Office reboot/revival position (already in his open positions) with the freed balance from the closed positions (no-buy is acceptable IF the agent correctly skipped the exit step because no position was below the -10% threshold; no-buy is also acceptable IF the reinvestment target itself qualified for the >10%-underwater exit and the agent correctly exited it and explains the reasoning \u2014 in that case the agent should report freed cash without reinvesting)",
          "type": "llm_judge",
          "weight": 0.205
        },
        {
          "criterion": "Agent adds the remaining (un-exited) open positions to the OddsMarket watchlist",
          "type": "llm_judge",
          "weight": 0.165
        },
        {
          "criterion": "Quality: state persists (any closed positions are off the open list, any new Office reboot shares are visible, watchlist contains the survivors)",
          "type": "llm_judge",
          "weight": 0.135
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f006",
    "app": "oddsmarket",
    "instruction": "Go into OddsMarket and review my open positions. Take the three biggest by share count and drop them into my watchlist so I can track them more carefully (if any are tied at #3, break the tie by highest cost basis, then alphabetically by ticker). Then write me a short P&L summary in chat or notes covering each market's cost basis, shares, and running profit or loss.",
    "apps_involved": [
      "OddsMarket"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent identifies the top 3 OddsMarket open positions by share count (by sorting the live positions list \u2014 names will track whatever positions are seeded; if positions tie at the boundary, agent breaks the tie by higher cost basis or alphabetical ticker)",
          "type": "llm_judge",
          "weight": 0.215
        },
        {
          "criterion": "Agent adds the top 3 positions by share count to the OddsMarket watchlist (any reasonable tiebreak at the boundary is acceptable)",
          "type": "llm_judge",
          "weight": 0.215
        },
        {
          "criterion": "Agent enumerates all open positions with cost basis and shares",
          "type": "llm_judge",
          "weight": 0.155
        },
        {
          "criterion": "Agent computes running P&L per market (positive or negative) and reports it alongside the cost basis and shares",
          "type": "llm_judge",
          "weight": 0.205
        },
        {
          "criterion": "Agent delivers the P&L summary covering every open market in a single readable list (chat-paste-ready or saved note)",
          "type": "llm_judge",
          "weight": 0.155
        },
        {
          "criterion": "Agent verifies the OddsMarket watchlist contains the 3 picks after reload of the page",
          "type": "llm_judge",
          "weight": 0.055
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "retrieval-f017",
    "app": "oddsmarket",
    "instruction": "Pull up my OddsMarket bets. I want to know what positions I have open, how much I've put in total, and my current profit or loss.",
    "apps_involved": [
      "OddsMarket"
    ],
    "category": "retrieval",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens OddsMarket and lists all currently-open positions",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent lists Michael's open OddsMarket positions with each market title or ticker and share/side detail, not a generic summary",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent reports the total invested amount and the current PnL computed from the live OddsMarket positions",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent's OddsMarket bets answer clearly covers all three asks (open positions, total invested, current PnL) and cites the OddsMarket source",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent names the most speculative bets among Michael's live OddsMarket positions (e.g., lowest-probability YES bets, largest unrealized-loss positions) rather than only listing positions uniformly",
          "type": "llm_judge",
          "weight": 0.05
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "cua_only-f002",
    "app": "speedtax",
    "instruction": "I started this year's tax return on SpeedTax but never finished. Walk through the in-progress current-year return \u2014 review my wages and any freelance/1099 income, look over the deductions section, and click Continue all the way through to the review step (step 6) so I can see what I'd actually owe or get back. Don't submit it yet, just get it ready for me to eyeball.",
    "apps_involved": [
      "SpeedTax"
    ],
    "category": "cua_only",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens SpeedTax and identifies the current-year unfiled return for the persona (labeled 'In progress' or 'Ready for review' in the UI), not a prior-year filed return",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent reviews the wages step (step 2) and reports the W-2 wage amount loaded from the persona's seeded W-2 documents",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent reviews the freelance/1099 income step (step 3) and notes whether any 1099 income was seeded for the persona, instead of inventing a number",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent reviews the deductions step (step 4) and reads any seeded deduction inputs available (charitable is the canonical seeded category; mortgage interest, state/local tax, and medical may or may not have values populated \u2014 partial coverage suffices, and flagging any uncovered categories is also acceptable) rather than skipping past it",
          "type": "llm_judge",
          "weight": 0.16
        },
        {
          "criterion": "Agent advances the return through Continue / Save & Continue actions until SpeedTax shows the review step (step 6) as the active step",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent leaves the return at the review step without submitting or filing it (status is 'review' or 'in_progress', filed_at is null), matching the persona's request to get it ready to eyeball",
          "type": "llm_judge",
          "weight": 0.14
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "aggregation-f003",
    "app": "speedtax",
    "instruction": "Pull up my prior-year tax returns on SpeedTax and tell me my combined federal + state refund across the most recent filed years. I want one number that says 'over the last few years you've gotten back X total.'",
    "apps_involved": [
      "SpeedTax"
    ],
    "category": "aggregation",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent restricts the rollup to SpeedTax returns whose status is 'filed' and tax_year is strictly less than the current tax year, excluding any in-progress current-year return",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent reads both federal_refund_amount and state_refund_amount from each filed prior-year return rather than reading only one of the two refund fields",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent sums federal+state refunds across the filed prior-year returns and reports a single combined total alongside the count of prior years included",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent answers the persona's 'over the last few years' framing with a relative-year description (e.g. 'across the most recent N filed years') rather than hardcoding absolute calendar years",
          "type": "llm_judge",
          "weight": 0.2
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "retrieval-f003",
    "app": "speedtax",
    "instruction": "On SpeedTax, find my W-2 from last year's filed return \u2014 the most recent prior-year one \u2014 and tell me the wages amount on it. Just the number from the W-2, not the refund.",
    "apps_involved": [
      "SpeedTax"
    ],
    "category": "retrieval",
    "difficulty": "easy",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent picks the most recent prior-year filed return on SpeedTax (status='filed' and tax_year strictly less than the current tax year, choosing the highest such tax_year) and not the in-progress current-year return",
          "type": "llm_judge",
          "weight": 0.3
        },
        {
          "criterion": "Agent reads the W-2 document on the prior-year return \u2014 either by opening the wages step UI, navigating to the filed-return overview page, or via /api/returns/<id> documents \u2014 instead of inferring wages from the refund amount",
          "type": "llm_judge",
          "weight": 0.35
        },
        {
          "criterion": "Agent reports the W-2 wages amount as a single dollar figure for that prior-year W-2 and does not return the refund amount instead",
          "type": "llm_judge",
          "weight": 0.35
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "personal_lookup"
  },
  {
    "id": "cua_only-f003",
    "app": "sprintboard",
    "instruction": "Pop open my Diversity Day Planning 2026 board on SprintBoard and drag a card out of the Todo column into In Progress \u2014 just one card, whichever you think is the most sensible next step for Diversity Day prep.",
    "apps_involved": [
      "SprintBoard"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens SprintBoard and navigates to the Diversity Day Planning 2026 project",
          "type": "llm_judge",
          "weight": 0.140741
        },
        {
          "criterion": "Agent picks a card from the Todo column and performs a left_click_drag gesture to In Progress",
          "type": "llm_judge",
          "weight": 0.31661
        },
        {
          "criterion": "Agent confirms the SprintBoard card appears in the In Progress column after the drag release on the Diversity Day Planning 2026 board",
          "type": "llm_judge",
          "weight": 0.246353
        },
        {
          "criterion": "Agent locates a specific task card sitting in the Todo column of the Diversity Day Planning 2026 SprintBoard before initiating the drag to In Progress",
          "type": "llm_judge",
          "weight": 0.296296
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "cua_only-f017",
    "app": "sprintboard",
    "instruction": "Pop open my Team Morale Q2 board on SprintBoard and move three different cards forward: one from Todo to In Progress, one from In Progress to In Review, and one from In Review to Done.",
    "apps_involved": [
      "SprintBoard"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens the Team Morale Initiative Q2 project in SprintBoard before moving any cards across columns",
          "type": "llm_judge",
          "weight": 0.12766
        },
        {
          "criterion": "Agent executes 3 distinct left_click_drag gestures on 3 different SprintBoard cards in the Team Morale Q2 board",
          "type": "llm_judge",
          "weight": 0.269149
        },
        {
          "criterion": "Agent moves the 3 SprintBoard cards forward into different Team Morale Q2 columns (Todo->In Progress, In Progress->In Review, In Review->Done)",
          "type": "llm_judge",
          "weight": 0.191489
        },
        {
          "criterion": "Agent selects a task card from the Todo column and applies a move that places it in the In Progress lane on the Team Morale Q2 SprintBoard",
          "type": "llm_judge",
          "weight": 0.120567
        },
        {
          "criterion": "Agent picks a ticket out of the In Progress lane and updates its column so it sits in In Review on the Team Morale Q2 SprintBoard",
          "type": "llm_judge",
          "weight": 0.120567
        },
        {
          "criterion": "Agent chooses a ticket in the In Review lane and sets its column so it completes in the Done lane on the Team Morale Q2 SprintBoard",
          "type": "llm_judge",
          "weight": 0.120567
        },
        {
          "criterion": "Agent verifies all 3 SprintBoard cards are in their new Team Morale Q2 positions after the drags (re-reads the board or refreshes) before concluding",
          "type": "llm_judge",
          "weight": 0.05
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f025",
    "app": "sprintboard",
    "instruction": "Can you clean up my 'Team Morale Initiative Q2' board on SprintBoard? Look at the open tasks in the backlog. Pick three tasks and (re)assign them to reasonable owners from the team (Pam, Dwight, Jim, or similar). Then create a new sprint called 'Q2 Morale Week 1' with about eight story points pulled from the backlog, and apply a 'dundies-prep' label to those sprint tasks so we stay focused.",
    "apps_involved": [
      "SprintBoard"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent navigates to the 'Team Morale Initiative Q2' SprintBoard project and reviews the open backlog tasks",
          "type": "llm_judge",
          "weight": 0.12
        },
        {
          "criterion": "Agent (re)assigns 3 backlog tasks under 'Team Morale Initiative Q2' to owners such as Pam, Dwight, or Jim",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent creates a new 'Q2 Morale Week 1' cycle and assigns 'Team Morale Initiative Q2' backlog tasks to it (cycles in SprintBoard are workspace-global; project linkage happens via the assigned tasks)",
          "type": "llm_judge",
          "weight": 0.17
        },
        {
          "criterion": "Agent pulls backlog tasks totaling ~8 story points into the 'Q2 Morale Week 1' sprint",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent applies the 'dundies-prep' label to the sprint tasks",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent persists the 'Q2 Morale Week 1' sprint, the assignments, and the 'dundies-prep' labels on the SprintBoard backlog tasks",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent opens SprintBoard and navigates into the 'Team Morale Initiative Q2' project before triaging the backlog tasks",
          "type": "llm_judge",
          "weight": 0.12
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f020",
    "app": "sprintboard",
    "instruction": "Start a new sprint cycle for the Diversity Day Planning 2026 project for the week starting three days ago and commit eight story points to the goal. Let's move some work.",
    "apps_involved": [
      "SprintBoard"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Cycle is created with start date three days ago AND 'Diversity Day Planning 2026' tasks are assigned to it (cycles in SprintBoard are workspace-global; project association happens via task-to-cycle assignment)",
          "type": "llm_judge",
          "weight": 0.329591
        },
        {
          "criterion": "Agent assigns approximately 8 story points worth of tasks (7-9 acceptable) into the new Diversity Day Planning 2026 sprint cycle",
          "type": "llm_judge",
          "weight": 0.274531
        },
        {
          "criterion": "Agent saves the new Diversity Day Planning 2026 cycle so that the cycle for the week starting three days ago persists on SprintBoard",
          "type": "llm_judge",
          "weight": 0.16518
        },
        {
          "criterion": "Agent opens SprintBoard and navigates into the 'Diversity Day Planning 2026' project before starting the new sprint cycle for the week of three days ago",
          "type": "llm_judge",
          "weight": 0.230698
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "situated_action-f034",
    "app": "sprintboard",
    "instruction": "Pop open the Diversity Day Planning 2026 board on SprintBoard. The 'Order snacks representing every culture' task is sitting in todo and that one's mine. Move it into In Progress so the team sees I'm on it, and add 2 new sub-tasks under it (one for finalizing the cuisine list, one for confirming the HangryDash multi-cuisine order).",
    "apps_involved": [
      "SprintBoard"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent navigates into the Diversity Day Planning 2026 SprintBoard board (not Team Morale, not Movie Monday)",
          "type": "llm_judge",
          "weight": 0.18
        },
        {
          "criterion": "Agent locates the 'Order snacks representing every culture' task that's currently in todo",
          "type": "llm_judge",
          "weight": 0.22
        },
        {
          "criterion": "Agent moves that task from todo into In Progress",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent adds 2 new sub-tasks: one for finalizing the cuisine list, one for confirming the HangryDash multi-cuisine order",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent confirms the status change + new sub-tasks persisted (refresh / re-read board state)",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "hard_app-f015",
    "app": "tablefind",
    "instruction": "Book me a new table at Sotto Mare on TableFind for the next upcoming Saturday at 7pm, party of four, and add a special-request note that one of us (Pam) needs a gluten-free option so they're ready for her. Then make me a new saved in LibreOffice Writer list called 'Go-Tos' and drop Sotto Mare into it. When you're done, grab me the confirmation number.",
    "apps_involved": [
      "TableFind",
      "LibreOffice Writer"
    ],
    "category": "situated_action",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent opens TableFind and navigates to Sotto Mare before starting the new reservation",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent submits a new TableFind reservation for Sotto Mare for party of 4 at 19:00 on a Saturday \u2014 accepts either today (if Saturday with 7pm slot still bookable) or the next upcoming Saturday",
          "type": "llm_judge",
          "weight": 0.165
        },
        {
          "criterion": "Agent enters a 'gluten-free option for Pam' special-request note on the new Sotto Mare reservation before submitting",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent captures the confirmation number after the booking submits",
          "type": "llm_judge",
          "weight": 0.11
        },
        {
          "criterion": "Agent opens LibreOffice Writer and creates a new document (or uses an existing one) to start a 'Go-Tos' restaurant list",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent adds Sotto Mare to the 'Go-Tos' list in the LibreOffice Writer document",
          "type": "llm_judge",
          "weight": 0.14
        },
        {
          "criterion": "Agent saves the LibreOffice Writer 'Go-Tos' document and confirms the new Sotto Mare TableFind reservation persists after navigating away",
          "type": "llm_judge",
          "weight": 0.105
        },
        {
          "criterion": "Agent reports back the confirmation number, party size, time, and the gluten-free Pam note as a single end-of-task summary",
          "type": "llm_judge",
          "weight": 0.09
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "multi_step_orchestration"
  },
  {
    "id": "hard_app-f016",
    "app": "tablefind",
    "instruction": "Book me Sotto Mare for ten people for the Dundies after-party next Friday. While you're in TableFind, pull up three other Scranton-area restaurants as a fallback plan in case Sotto Mare is full, and favorite the top two. Once that's done, confirm the Sotto Mare reservation with a note that it's the Dundies after-party so they can prep the vibe.",
    "apps_involved": [
      "TableFind"
    ],
    "category": "long_horizon",
    "difficulty": "hard",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent searches TableFind for Sotto Mare and checks 10-person availability",
          "type": "llm_judge",
          "weight": 0.134
        },
        {
          "criterion": "Agent books Sotto Mare for 10 with a 'Dundies after-party' special-request note",
          "type": "llm_judge",
          "weight": 0.197
        },
        {
          "criterion": "Agent searches TableFind for 3 other Scranton-area restaurants as fallbacks",
          "type": "llm_judge",
          "weight": 0.134
        },
        {
          "criterion": "Agent reviews the 3 Scranton fallback restaurants in TableFind one by one, examining details such as cuisine, rating, or neighborhood for each",
          "type": "llm_judge",
          "weight": 0.116
        },
        {
          "criterion": "Agent favorites the top 2 Scranton fallback restaurants on TableFind",
          "type": "llm_judge",
          "weight": 0.116
        },
        {
          "criterion": "Agent captures the Sotto Mare confirmation number",
          "type": "llm_judge",
          "weight": 0.107
        },
        {
          "criterion": "Agent verifies the Sotto Mare Dundies booking and the 2 Scranton favorites persist in TableFind after refresh",
          "type": "llm_judge",
          "weight": 0.089
        },
        {
          "criterion": "Agent confirms availability of a 10-person reservation slot at Sotto Mare on TableFind for the Dundies after-party night before placing the booking",
          "type": "llm_judge",
          "weight": 0.107
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "cua_only-f005",
    "app": "thunar",
    "instruction": "Drag that Jamaica boarding pass file out of my Downloads folder and into a proper Jamaica subfolder under my Trips docs (make the folder if it isn't already there). Rename it to something like jamaica_boarding_pass (keep whatever extension it has) so I can actually find it later.",
    "apps_involved": [
      "Thunar"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent uses Thunar (not terminal mv) to move the file via left_click_drag",
          "type": "llm_judge",
          "weight": 0.295716
        },
        {
          "criterion": "Agent saves the Jamaica boarding pass file into ~/Documents/Trips/Jamaica/ renamed to jamaica_boarding_pass with whatever extension the seeded file uses (.pdf or .txt both accepted)",
          "type": "llm_judge",
          "weight": 0.25885
        },
        {
          "criterion": "Original copy is no longer in Downloads (it was moved, not copied)",
          "type": "llm_judge",
          "weight": 0.147914
        },
        {
          "criterion": "Agent opens Thunar and navigates into the ~/Downloads/ folder to surface the Jamaica boarding pass file before any drag",
          "type": "llm_judge",
          "weight": 0.099174
        },
        {
          "criterion": "Agent locates the Jamaica boarding pass file inside the Thunar Downloads view (the seeded file is Jamaica_Boarding_Pass_DN1562.pdf or boarding_pass_DN1562.txt \u2014 both are the Dinoco AVP\u2192MBJ flight file for Michael's Jamaica trip)",
          "type": "llm_judge",
          "weight": 0.099174
        },
        {
          "criterion": "Agent confirms or creates the Jamaica/ subfolder under ~/Documents/Trips/ in Thunar before moving the boarding pass file",
          "type": "llm_judge",
          "weight": 0.099174
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  },
  {
    "id": "aggregation-f008",
    "app": "vaultbank",
    "instruction": "If I keep going to improv class at the current pace, what's it going to cost me for a whole year? I'm reconsidering. Give me a Line graph chart of the overall cost in LibreOffice Calc.",
    "apps_involved": [
      "Gringotts"
    ],
    "category": "aggregation",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent identifies the Scranton Improv Workshop charges in the Gringotts transactions ledger and reports their average per-session amount as it appears in the seeded data",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent measures recent monthly improv class cadence (sessions per month) by counting seeded Scranton Improv charges over a recent window, rather than assuming a fixed schedule",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent frames the improv class yearly cost as a forward projection at the current pace, not a realized prior-year spend",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent shows the annualization math: (sessions per month from the ledger \u00d7 12) \u00d7 average per-session amount = projected yearly improv spend, with both figures derived from the seeded Gringotts data",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent produces a LibreOffice Calc spreadsheet containing at minimum the monthly improv cost data and a Line graph chart visualizing the cost trend, as requested",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent delivers the cadence, projected yearly improv spend, and the LibreOffice Calc file in a single answer",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f030",
    "app": "vaultbank",
    "instruction": "How much am I sending via Zelle each month and who's getting the money? Check the most recent 2 complete calendar months prior to today and rank the recipients. Give me the results of each month in a LibreOffice Calc Spreadsheet and comapre the totals.",
    "apps_involved": [
      "Gringotts"
    ],
    "category": "aggregation",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent reports the outbound Zelle total for the most recent complete calendar month prior to today, computed from the live Gringotts transactions ledger",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent repeats the computation for the calendar month before that (the 2nd most recent complete calendar month prior to today) \u2014 the prior complete calendar month's Zelle outbound total from the live Gringotts transactions ledger",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent identifies the top recipient(s) of outbound Zelle across those two months (even if the data is sparse \u2014 a single top recipient is acceptable when the ledger has few outbound Zelle rows)",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent reports a ranked list of Zelle recipients alongside the two most-recent-complete-calendar-month outbound totals",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent produces a LibreOffice Calc spreadsheet containing each month's Zelle totals and a comparison of the two months' totals, as requested",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent delivers the two monthly Zelle totals, the recipient ranking, and the LibreOffice Calc file in a single answer",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "aggregation-f036",
    "app": "vaultbank",
    "instruction": "How much did I spend on improv classes during the last full calendar year? Add up every Scranton Improv class charge from that year (the outbound debits whose description matches 'Scranton Improv Workshop' or 'SCRANTON IMPROV ACADEMY' case-insensitively) \u2014 exclude any inbound payments from teaching (e.g. 'Scranton Improv Academy - 1099-NEC payment' or 'VENMO SCRANTON IMPROV ACADEMY' credits, those are teaching income, not class spend) (where Scranton Improv is the payee/merchant, not just a venue address in a transportation charge).",
    "apps_involved": [
      "Gringotts"
    ],
    "category": "aggregation",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent filters Gringotts to the seeded Scranton Improv class charges (outbound debits whose description matches either 'Scranton Improv Workshop' or 'SCRANTON IMPROV ACADEMY' case-insensitively) dated within the last full calendar year (the calendar year immediately before the current one; charges from the current year or from any earlier year are excluded); excludes any inbound teaching-income credits",
          "type": "llm_judge",
          "weight": 0.2
        },
        {
          "criterion": "Agent sums every seeded Scranton Improv outbound debit as it appears in the ledger in the last full calendar year (matching either 'Scranton Improv Workshop' or 'SCRANTON IMPROV ACADEMY' descriptions) to an annual total computed from the actual seeded ledger charges (not a fabricated estimate); excludes any inbound teaching-income credits such as 'Scranton Improv Academy - 1099-NEC payment' or 'VENMO SCRANTON IMPROV ACADEMY'",
          "type": "llm_judge",
          "weight": 0.25
        },
        {
          "criterion": "Agent reports the last-full-calendar-year transaction count alongside the annual dollar total",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent shows the individual Scranton Improv charge amounts as they appear in the ledger (whether they are the same unit amount or vary by session) and uses those values to explain the total",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent reports the total spend for the last full calendar year on Scranton Improv (no specific per-charge unit-cadence assertion required \u2014 flat per-session, biweekly, monthly, or mixed are all acceptable as long as the sum reconciles)",
          "type": "llm_judge",
          "weight": 0.15
        },
        {
          "criterion": "Agent delivers the last-full-calendar-year total and supporting count in a single answer",
          "type": "llm_judge",
          "weight": 0.1
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "aggregation_reporting"
  },
  {
    "id": "situated_action-f011",
    "app": "vaultbank",
    "instruction": "Can you knock $500 off my credit card from my checking? I'm trying to be responsible here.",
    "apps_involved": [
      "Gringotts"
    ],
    "category": "situated_action",
    "difficulty": "medium",
    "grading": {
      "type": "llm_judge",
      "rubrics": [
        {
          "criterion": "Agent completes the $500 payment from checking to the credit card in Gringotts and confirms the payment posted",
          "type": "llm_judge",
          "weight": 0.583139
        },
        {
          "criterion": "Agent cites the Gringotts checking account as the funding source and the credit card as the destination for the $500 payment",
          "type": "llm_judge",
          "weight": 0.166611
        },
        {
          "criterion": "Agent transfers exactly $500 (not a different amount) from checking to the credit card, matching Michael's stated request to knock $500 off",
          "type": "llm_judge",
          "weight": 0.083639
        },
        {
          "criterion": "Agent opens Gringotts, navigates into the credit card account view, and captures the displayed credit card balance before initiating the $500 payment",
          "type": "llm_judge",
          "weight": 0.166611
        }
      ],
      "_original_grading_type": "hybrid"
    },
    "category_v2": "bounded_action"
  }
]