Understanding Costs and Speed for Merge¶
Every data engineer has faced the challenge: you have two tables that should join, but the keys don't quite match. Company names are spelled differently. Subsidiaries need to map to parents. Typos have crept in. Abbreviations vary.
The everyrow.merge() operation solves this by using a cost-optimized cascade of matching strategies:
| Strategy | Cost | Speed | Example |
|---|---|---|---|
| Exact match | Free | Instant | "Apple Inc" → "Apple Inc" |
| Fuzzy match | Free | Fast | "Microsft Corp" → "Microsoft Corp" |
| LLM reasoning | ~$0.002/row | ~1s/row | "Instagram" → "Meta Platforms" |
| Web search | ~$0.01/row | ~5s/row | Obscure/stale data |
The key insight: most real-world matches are cheap or free. The expensive LLM-based matching only kicks in when simpler methods fail.
This notebook empirically tests these claims with increasing levels of matching difficulty, measuring actual costs and timing at each step.
# Setup: install everyrow if needed and configure API key
try:
import everyrow
except ImportError:
%pip install everyrow
import os
if "EVERYROW_API_KEY" not in os.environ:
os.environ["EVERYROW_API_KEY"] = "your-api-key-here" # Get one at everyrow.io
Setup¶
First, let's set up our imports and create helper functions for measuring costs and timing.
import asyncio
import time
from dataclasses import dataclass
from typing import Literal

from dotenv import load_dotenv

# Load EVERYROW_API_KEY (and any other settings) from a local .env file
# before the everyrow client is imported/used below.
load_dotenv()

import pandas as pd
import numpy as np

from everyrow import create_session, get_billing_balance
from everyrow.ops import merge
@dataclass
class ExperimentResult:
"""Results from a merge experiment."""
name: str
rows: int
cost_dollars: float
duration_seconds: float
accuracy_pct: float | None = None
def __repr__(self):
acc = f", accuracy={self.accuracy_pct:.1f}%" if self.accuracy_pct is not None else ""
cost_per_row = self.cost_dollars / self.rows if self.rows > 0 else 0
return (f"ExperimentResult({self.name}: {self.rows} rows, "
f"${self.cost_dollars:.4f} (${cost_per_row:.5f}/row), "
f"{self.duration_seconds:.1f}s{acc})")
# Store all experiment results for final comparison
# (appended to by measure_merge(); read by the summary cells at the end).
all_results: list[ExperimentResult] = []
async def measure_merge(
    name: str,
    task: str,
    left_table: pd.DataFrame,
    right_table: pd.DataFrame,
    merge_on_left: str | None = None,
    merge_on_right: str | None = None,
    expected_matches: dict[str, str] | None = None,
    use_web_search: Literal['auto', 'yes', 'no'] | None = None,
) -> tuple[pd.DataFrame, ExperimentResult]:
    """
    Execute one merge and capture its billing cost, wall-clock time, and accuracy.

    Args:
        name: Experiment name, used for the session and for logging.
        task: Natural-language description of the merge task.
        left_table: Left DataFrame (all rows preserved).
        right_table: Right DataFrame to match from.
        merge_on_left: Column name in the left table (optional hint).
        merge_on_right: Column name in the right table (optional hint).
        expected_matches: Map of left value -> expected right value; accuracy
            is only scored when both merge_on hints are also provided.
        use_web_search: "auto", "yes", or "no" (None defers to the default).

    Returns:
        Tuple of (merged DataFrame, ExperimentResult). The result is also
        appended to the module-level all_results list.
    """
    # Snapshot the balance so the run's cost can be computed as a delta.
    balance_before = await get_billing_balance()
    started = time.time()

    async with create_session(name=name) as session:
        print(f"Session URL: {session.get_url()}")
        result = await merge(
            task=task,
            session=session,
            left_table=left_table,
            right_table=right_table,
            merge_on_left=merge_on_left,
            merge_on_right=merge_on_right,
            use_web_search=use_web_search,
        )

    finished = time.time()
    # Billing is reported asynchronously; give it a minute to catch up before
    # sampling the balance again.  The sleep is deliberately excluded from
    # the measured duration (finished is captured first).
    await asyncio.sleep(60)
    balance_after = await get_billing_balance()

    cost = balance_before.current_balance_dollars - balance_after.current_balance_dollars
    elapsed = finished - started

    # Score accuracy only when we know which columns were matched.
    accuracy = None
    if expected_matches and merge_on_left and merge_on_right:
        total = len(expected_matches)
        hits = 0
        for left_val, expected_right in expected_matches.items():
            candidates = result.data[result.data[merge_on_left] == left_val]
            if len(candidates) > 0:
                actual = candidates[merge_on_right].iloc[0]
                # Substring containment tolerates minor formatting differences.
                if pd.notna(actual) and expected_right in str(actual):
                    hits += 1
        accuracy = (hits / total) * 100 if total > 0 else None

    exp_result = ExperimentResult(
        name=name,
        rows=len(left_table),
        cost_dollars=cost,
        duration_seconds=elapsed,
        accuracy_pct=accuracy,
    )
    all_results.append(exp_result)
    print(f"\n{exp_result}")
    return result.data, exp_result
Experiment 1: Exact String Matches Only¶
Let's start with the simplest case: both tables use identical strings. This should be instant and free since the system can do a simple string comparison.
We'll create a realistic scenario: matching a list of Fortune 500 companies to their revenue data.
# Fortune 500-style company data with EXACT matching names
_company_rows = [
    ("Apple Inc.", "Technology"),
    ("Microsoft Corporation", "Technology"),
    ("Amazon.com Inc.", "Consumer Cyclical"),
    ("Alphabet Inc.", "Technology"),
    ("Meta Platforms Inc.", "Technology"),
    ("Tesla Inc.", "Consumer Cyclical"),
    ("NVIDIA Corporation", "Technology"),
    ("JPMorgan Chase & Co.", "Financial Services"),
    ("Johnson & Johnson", "Healthcare"),
    ("Visa Inc.", "Financial Services"),
]
companies_exact = pd.DataFrame(_company_rows, columns=["company", "sector"])

_revenue_rows = [
    ("Apple Inc.", 394),
    ("Microsoft Corporation", 211),
    ("Amazon.com Inc.", 574),
    ("Alphabet Inc.", 307),
    ("Meta Platforms Inc.", 134),
    ("Tesla Inc.", 96),
    ("NVIDIA Corporation", 61),
    ("JPMorgan Chase & Co.", 158),
    ("Johnson & Johnson", 95),
    ("Visa Inc.", 32),
]
revenue_exact = pd.DataFrame(_revenue_rows, columns=["company_name", "revenue_billions"])

# Identity mapping: every left name should match itself on the right.
expected_exact = {company: company for company in companies_exact["company"]}

print(f"Left table: {len(companies_exact)} rows")
print(f"Right table: {len(revenue_exact)} rows")
companies_exact.head(3)
# Experiment 1: run the merge on perfectly matching names — the cascade
# should resolve every row via exact string comparison at zero cost.
result_exact, stats_exact = await measure_merge(
    name="Exact matches only",
    task="Match companies by name. Names are identical in both tables.",
    left_table=companies_exact,
    right_table=revenue_exact,
    merge_on_left="company",
    merge_on_right="company_name",
    expected_matches={c: c for c in companies_exact["company"]},
)
# Quick sanity check on the merged columns.
result_exact[["company", "sector", "company_name", "revenue_billions"]].head(5)
# Same experiment, but with the last two revenue rows dropped so two
# left-table companies have no counterpart on the right.
# NOTE(review): this reuses the session name and overwrites the
# result_exact/stats_exact variables from the previous cell.
result_exact, stats_exact = await measure_merge(
    name="Exact matches only",
    task="Match companies by name. Names are identical in both tables.",
    left_table=companies_exact,
    right_table=revenue_exact.iloc[:-2],
    merge_on_left="company",
    merge_on_right="company_name",
    # Both tables list companies in the same order, so slicing the left
    # names mirrors the two rows removed from the right table.
    expected_matches={c: c for c in companies_exact["company"].iloc[:-2]},
)
result_exact[["company", "sector", "company_name", "revenue_billions"]].head(5)
result_exact
As expected: zero cost for exact string matches. The cascade never needed to invoke LLM reasoning.
Experiment 2: Exact + Fuzzy Matches (Typos & Variations)¶
Real-world data is messy. Let's introduce realistic variations:
- Typos: "Microsft" instead of "Microsoft"
- Case differences: "APPLE INC" vs "Apple Inc."
- Missing punctuation: "Johnson Johnson" vs "Johnson & Johnson"
- Spacing issues: "JP Morgan" vs "JPMorgan"
These should all be handled by fuzzy matching, which is still free.
# Same companies but with realistic typos and variations
_fuzzy_rows = [
    ("APPLE INC", "Technology"),                # case difference
    ("Microsft Corporation", "Technology"),     # typo
    ("Amazon Inc", "Consumer Cyclical"),        # missing .com
    ("Alphabet", "Technology"),                 # missing Inc.
    ("Meta Platforms", "Technology"),           # missing Inc.
    ("Telsa Inc.", "Consumer Cyclical"),        # typo (Telsa)
    ("Nvidia Corp", "Technology"),              # abbreviation
    ("JP Morgan Chase", "Financial Services"),  # spacing
    ("Johnson Johnson", "Healthcare"),          # missing &
    ("Visa", "Financial Services"),             # missing Inc.
]
companies_fuzzy = pd.DataFrame(_fuzzy_rows, columns=["company", "sector"])

# Expected matches (left company -> right company_name)
expected_fuzzy = {
    "APPLE INC": "Apple Inc.",
    "Microsft Corporation": "Microsoft Corporation",
    "Amazon Inc": "Amazon.com Inc.",
    "Alphabet": "Alphabet Inc.",
    "Meta Platforms": "Meta Platforms Inc.",
    "Telsa Inc.": "Tesla Inc.",
    "Nvidia Corp": "NVIDIA Corporation",
    "JP Morgan Chase": "JPMorgan Chase & Co.",
    "Johnson Johnson": "Johnson & Johnson",
    "Visa": "Visa Inc.",
}

print("Sample variations:")
for left, right in list(expected_fuzzy.items())[:5]:
    print(f" '{left}' → '{right}'")
# Experiment 2: messy names — typos, casing, spacing.  Fuzzy matching should
# still resolve all of these without invoking the LLM.
result_fuzzy, stats_fuzzy = await measure_merge(
    name="Exact + fuzzy (typos)",
    task="Match companies by name. Handle typos, case differences, and minor variations.",
    left_table=companies_fuzzy,
    right_table=revenue_exact,
    merge_on_left="company",
    merge_on_right="company_name",
    expected_matches=expected_fuzzy,
)
result_fuzzy[["company", "company_name", "revenue_billions"]]
Still zero (or near-zero) cost! Fuzzy string matching handles all these variations without needing LLM reasoning.
Experiment 3: Mostly Exact + Few LLM Matches (Semantic Relationships)¶
Now let's introduce cases that require semantic understanding:
- Subsidiaries: "Instagram" should match "Meta Platforms"
- Parent companies: "YouTube" should match "Alphabet"
- Acquisitions: "LinkedIn" should match "Microsoft"
- Regional names: "MSD" is Merck's name outside the US
These can't be solved by string matching alone—the LLM needs to know that Instagram is owned by Meta.
Hypothesis: With mostly exact matches and only a few semantic ones, costs should be minimal since only the semantic matches invoke the LLM.
# Mix of exact matches and semantic relationships
companies_semantic = pd.DataFrame({
    "company": [
        # Exact matches (7 rows - should be free)
        "Apple Inc.",
        "Microsoft Corporation",
        "Amazon.com Inc.",
        "Tesla Inc.",
        "NVIDIA Corporation",
        "JPMorgan Chase & Co.",
        "Visa Inc.",
        # Semantic matches (3 rows - require LLM)
        "Instagram",   # → Meta Platforms Inc.
        "YouTube",     # → Alphabet Inc.
        "WhatsApp",    # → Meta Platforms Inc.
    ],
    "sector": [
        "Technology",
        "Technology",
        "Consumer Cyclical",
        "Consumer Cyclical",
        "Technology",
        "Financial Services",
        "Financial Services",
        "Technology",
        "Technology",
        "Technology",
    ],
})

# Left value -> canonical right-table name (parent company for semantic rows).
expected_semantic = {
    "Apple Inc.": "Apple Inc.",
    "Microsoft Corporation": "Microsoft Corporation",
    "Amazon.com Inc.": "Amazon.com Inc.",
    "Tesla Inc.": "Tesla Inc.",
    "NVIDIA Corporation": "NVIDIA Corporation",
    "JPMorgan Chase & Co.": "JPMorgan Chase & Co.",
    "Visa Inc.": "Visa Inc.",
    "Instagram": "Meta Platforms Inc.",
    "YouTube": "Alphabet Inc.",
    "WhatsApp": "Meta Platforms Inc.",
}

print(f"Total rows: {len(companies_semantic)}")
print(" - Exact matches expected: 7 (free)")
print(" - Semantic matches expected: 3 (LLM required)")
# Experiment 3: mostly exact matches plus three subsidiary/parent rows that
# require LLM reasoning; context in the task guides the semantic matches.
result_semantic, stats_semantic = await measure_merge(
    name="Mostly exact + semantic",
    task="""Match companies. Note:
- Instagram and WhatsApp are owned by Meta Platforms
- YouTube is owned by Alphabet (Google's parent)
""",
    left_table=companies_semantic,
    right_table=revenue_exact,
    merge_on_left="company",
    merge_on_right="company_name",
    expected_matches=expected_semantic,
)
result_semantic[["company", "company_name", "revenue_billions"]]
# Calculate estimated per-semantic-match cost.
# Attributes the entire run's cost to the 3 semantic rows, on the premise
# that the 7 exact matches were resolved for free by the cascade.
if stats_semantic.cost_dollars > 0:
    semantic_matches = 3  # Instagram, YouTube, WhatsApp
    cost_per_llm_match = stats_semantic.cost_dollars / semantic_matches
    print(f"Cost per LLM match: ${cost_per_llm_match:.4f}")
    print(f"Total cost for {semantic_matches} LLM matches: ${stats_semantic.cost_dollars:.4f}")
    print(f"\nThe 7 exact matches were FREE.")
This demonstrates the cascade in action: 70% of rows matched for free (exact matches), while only 30% required LLM reasoning.
Experiment 4: Non-Trivial Matching (Breakdown by Match Type)¶
Let's test a more realistic scenario where we need to match pharmaceutical company subsidiaries and regional variations to their parent companies. This is a common real-world challenge in clinical trial data.
We'll create data that tests the full cascade:
- Exact matches: Identical names
- Fuzzy matches: Typos and variations
- LLM matches: Subsidiaries, regional names, abbreviations
# Clinical trial sponsors (left table): a deliberate mix of exact names,
# typo/case/spacing variants, and subsidiaries/regional names.
_sponsor_rows = [
    # Exact matches (should be free)
    ("Pfizer Inc.", 150),
    ("Novartis AG", 120),
    ("Sanofi S.A.", 100),
    ("AstraZeneca PLC", 95),
    # Fuzzy matches (should still be free)
    ("Pfzer Inc", 5),        # Typo
    ("NOVARTIS", 8),         # Case
    ("Astra Zeneca", 12),    # Spacing
    # LLM matches - subsidiaries and regional names
    ("Genentech", 45),       # → Roche
    ("MSD", 80),             # → Merck (regional name)
    ("BMS", 60),             # → Bristol-Myers Squibb
    ("AbbVie", 70),          # Was part of Abbott
    ("Genzyme", 25),         # → Sanofi (acquired)
    ("Medimmune", 20),       # → AstraZeneca
]
trial_sponsors = pd.DataFrame(_sponsor_rows, columns=["sponsor", "trial_count"])

# Parent pharma companies (right table)
_parent_rows = [
    ("Pfizer Inc.", "USA", 250),
    ("Novartis AG", "Switzerland", 200),
    ("Roche Holding AG", "Switzerland", 280),
    ("Merck & Co.", "USA", 270),
    ("Bristol-Myers Squibb", "USA", 150),
    ("AbbVie Inc.", "USA", 260),
    ("Sanofi S.A.", "France", 130),
    ("AstraZeneca PLC", "UK", 220),
]
pharma_parents = pd.DataFrame(_parent_rows, columns=["company", "hq_country", "market_cap_b"])

# Sponsor -> substring expected in the matched parent name (accuracy scoring
# uses substring containment, so short canonical fragments suffice).
expected_pharma = {
    "Pfizer Inc.": "Pfizer", "Novartis AG": "Novartis", "Sanofi S.A.": "Sanofi",
    "AstraZeneca PLC": "AstraZeneca", "Pfzer Inc": "Pfizer", "NOVARTIS": "Novartis",
    "Astra Zeneca": "AstraZeneca", "Genentech": "Roche", "MSD": "Merck",
    "BMS": "Bristol-Myers", "AbbVie": "AbbVie", "Genzyme": "Sanofi",
    "Medimmune": "AstraZeneca",
}

print(f"Total sponsor records: {len(trial_sponsors)}")
print(f"\nExpected match breakdown:")
print(" - Exact matches: 4 rows (free)")
print(" - Fuzzy matches: 3 rows (free)")
print(" - LLM matches: 6 rows (charged)")
# Experiment 4: pharma subsidiaries/regional names — the task text supplies
# the domain relationships the LLM needs to resolve the hard rows.
result_pharma, stats_pharma = await measure_merge(
    name="Pharma non-trivial",
    task="""Match clinical trial sponsors to their parent pharmaceutical company.
Key relationships to know:
- Genentech is a subsidiary of Roche
- MSD is Merck's name outside the United States
- BMS is the abbreviation for Bristol-Myers Squibb
- Genzyme was acquired by Sanofi
- MedImmune is a subsidiary of AstraZeneca
""",
    left_table=trial_sponsors,
    right_table=pharma_parents,
    merge_on_left="sponsor",
    merge_on_right="company",
    expected_matches=expected_pharma,
)
result_pharma[["sponsor", "trial_count", "company", "hq_country", "market_cap_b"]]
# Analyze match type breakdown.
# Counts are taken from the dataset design above (4 exact / 3 fuzzy / 6 LLM),
# not measured from the merge output.
exact_matches = 4
fuzzy_matches = 3
llm_matches = 6
total = exact_matches + fuzzy_matches + llm_matches
print("Match Type Breakdown:")
print(f" Exact matches: {exact_matches:2d} ({exact_matches/total*100:.0f}%) - FREE")
print(f" Fuzzy matches: {fuzzy_matches:2d} ({fuzzy_matches/total*100:.0f}%) - FREE")
print(f" LLM matches: {llm_matches:2d} ({llm_matches/total*100:.0f}%) - CHARGED")
print(f" ─────────────────────")
print(f" Total: {total:2d}")
print(f"\nFree matches: {(exact_matches + fuzzy_matches)/total*100:.0f}%")
print(f"Paid matches: {llm_matches/total*100:.0f}%")
if stats_pharma.cost_dollars > 0:
    print(f"\nActual cost: ${stats_pharma.cost_dollars:.4f}")
    print(f"Cost per LLM match: ${stats_pharma.cost_dollars/llm_matches:.4f}")
Even with complex pharmaceutical relationships, over half the matches were free. The cost scales with the number of rows requiring semantic understanding, not the total row count.
Experiment 5: LLM-Only Matching (No merge_on Parameters)¶
What happens when you don't specify which columns to match? The system must:
- Analyze both tables to guess which columns are relevant
- Use LLM reasoning for every row
This is more expensive but useful when:
- You're not sure which columns should match
- Multiple columns might be relevant
- The matching logic is complex
# Contact data without clear merge keys — the link to companies is implicit
# in the email domain.
contacts = pd.DataFrame({
    "name": ["John Smith", "Sarah Johnson", "Mike Chen", "Emily Davis", "Tom Wilson"],
    "email": [
        "jsmith@acme.com",
        "sarah.j@techcorp.io",
        "m.chen@globalinc.com",
        "emily@startup.co",
        "twilson@bigco.com",
    ],
    "title": ["VP Sales", "CTO", "Director", "CEO", "Manager"],
})

# Company data to match against
companies = pd.DataFrame({
    "company_name": [
        "Acme Corporation",
        "TechCorp Solutions",
        "Global Industries Inc",
        "Startup Co",
        "BigCo Enterprises",
    ],
    "domain": ["acme.com", "techcorp.io", "globalinc.com", "startup.co", "bigco.com"],
    "industry": ["Manufacturing", "Software", "Consulting", "Technology", "Finance"],
})

print("Contacts:")
print(contacts.to_string(index=False))
print("\nCompanies:")
print(companies.to_string(index=False))
# Run WITHOUT specifying merge_on columns — the system must infer the
# relevant columns itself, which pushes every row through LLM reasoning.
result_nokeys, stats_nokeys = await measure_merge(
    name="LLM-only (no merge_on)",
    task="""Match each contact to their company.
Use the email domain to identify which company each person works for.
For example, jsmith@acme.com works at Acme Corporation.
""",
    left_table=contacts,
    right_table=companies,
    # Note: No merge_on_left or merge_on_right specified!
)
result_nokeys
# Compare: same data but WITH merge hints, then print cost/time side by side.
result_withkeys, stats_withkeys = await measure_merge(
    name="With merge_on hints",
    task="""Match contacts to companies by email domain.""",
    left_table=contacts,
    right_table=companies,
    merge_on_left="email",
    merge_on_right="domain",
)

print(f"\nComparison:")
print(f" Without merge_on: ${stats_nokeys.cost_dollars:.4f}, {stats_nokeys.duration_seconds:.1f}s")
print(f" With merge_on: ${stats_withkeys.cost_dollars:.4f}, {stats_withkeys.duration_seconds:.1f}s")
if stats_nokeys.cost_dollars > 0 and stats_withkeys.cost_dollars > 0:
    ratio = stats_nokeys.cost_dollars / stats_withkeys.cost_dollars
    print(f"\n LLM-only is {ratio:.1f}x more expensive")
Takeaway: Providing merge_on hints significantly reduces costs when the matching columns are known.
Experiment 6: Scaling Analysis¶
How do costs scale as we increase:
- Number of rows (10 → 50 → 100 → 200)
- Content per row (more columns, longer text)
For this experiment, we'll generate synthetic data with controllable characteristics and measure the cost/time relationship.
def generate_company_data(n_rows: int, add_description: bool = False) -> tuple[pd.DataFrame, pd.DataFrame]:
"""
Generate synthetic company data for scaling tests.
Returns left_table (with variations) and right_table (canonical names).
Mix includes: 40% exact, 30% fuzzy, 30% semantic.
"""
base_companies = [
("Apple Inc.", "AAPL", "Technology"),
("Microsoft Corporation", "MSFT", "Technology"),
("Amazon.com Inc.", "AMZN", "E-commerce"),
("Alphabet Inc.", "GOOGL", "Technology"),
("Meta Platforms Inc.", "META", "Technology"),
("Tesla Inc.", "TSLA", "Automotive"),
("NVIDIA Corporation", "NVDA", "Technology"),
("JPMorgan Chase & Co.", "JPM", "Finance"),
("Johnson & Johnson", "JNJ", "Healthcare"),
("Visa Inc.", "V", "Finance"),
]
# Variations for left table
variations = {
"Apple Inc.": ["Apple Inc.", "APPLE INC", "Apple"], # exact, case, short
"Microsoft Corporation": ["Microsoft Corporation", "Microsft Corp", "MSFT"],
"Amazon.com Inc.": ["Amazon.com Inc.", "Amazon Inc", "AWS"], # exact, fuzzy, semantic
"Alphabet Inc.": ["Alphabet Inc.", "Alphabet", "Google"],
"Meta Platforms Inc.": ["Meta Platforms Inc.", "Meta Platforms", "Facebook"],
"Tesla Inc.": ["Tesla Inc.", "Telsa Inc", "Tesla Motors"],
"NVIDIA Corporation": ["NVIDIA Corporation", "Nvidia Corp", "GeForce"],
"JPMorgan Chase & Co.": ["JPMorgan Chase & Co.", "JP Morgan", "Chase Bank"],
"Johnson & Johnson": ["Johnson & Johnson", "Johnson Johnson", "J&J"],
"Visa Inc.": ["Visa Inc.", "Visa", "Visa Card"],
}
left_rows = []
for i in range(n_rows):
base = base_companies[i % len(base_companies)]
company_name = base[0]
var_list = variations[company_name]
# Cycle through: exact (40%), fuzzy (30%), semantic (30%)
var_idx = i % 3 # 0=exact, 1=fuzzy, 2=semantic
var_name = var_list[min(var_idx, len(var_list)-1)]
row = {
"company": var_name,
"record_id": f"REC-{i:04d}",
"sector": base[2],
}
if add_description:
row["description"] = f"Company record {i} for {var_name}. " * 5
left_rows.append(row)
right_rows = [
{"company_name": c[0], "ticker": c[1], "industry": c[2], "employees": (i+1)*10000}
for i, c in enumerate(base_companies)
]
return pd.DataFrame(left_rows), pd.DataFrame(right_rows)
# Test the generator on a small sample before the scaling sweeps below.
test_left, test_right = generate_company_data(10)
print("Sample left table:")
print(test_left.head())
6.1 Scaling with Number of Rows¶
Let's measure how costs grow as we increase row count.
# Sweep increasing row counts, recording an ExperimentResult for each run.
row_counts = [10, 30, 50, 100]
scaling_results = []
for n_rows in row_counts:
    left_df, right_df = generate_company_data(n_rows)
    _, result = await measure_merge(
        name=f"Scale test: {n_rows} rows",
        task="""Match companies. Handle variations like:
- Google is Alphabet's main product
- Facebook is now Meta Platforms
- AWS is part of Amazon
- Chase Bank is part of JPMorgan
""",
        left_table=left_df,
        right_table=right_df,
        merge_on_left="company",
        merge_on_right="company_name",
    )
    scaling_results.append(result)
# Analyze scaling results: per-run table, then first-vs-last growth ratio.
print("\n" + "="*60)
print("ROW SCALING ANALYSIS")
print("="*60)
print(f"{'Rows':<10} {'Cost':>10} {'Time (s)':>10} {'$/row':>12}")
print("-"*42)
for r in scaling_results:
    cost_per_row = r.cost_dollars / r.rows if r.rows > 0 else 0
    print(f"{r.rows:<10} ${r.cost_dollars:>8.4f} {r.duration_seconds:>10.1f} ${cost_per_row:>10.5f}")

# Check if cost scales linearly (compare smallest vs largest run).
if len(scaling_results) >= 2 and scaling_results[0].cost_dollars > 0:
    first = scaling_results[0]
    last = scaling_results[-1]
    row_ratio = last.rows / first.rows
    cost_ratio = last.cost_dollars / first.cost_dollars if first.cost_dollars > 0 else 0
    print(f"\nScaling factor: {row_ratio:.0f}x rows → {cost_ratio:.1f}x cost")
    if cost_ratio > 0:
        # Treat cost growth within ±20% of the row growth as "linear".
        print(f"Cost scales {'linearly' if 0.8 < cost_ratio/row_ratio < 1.2 else 'sub-linearly' if cost_ratio/row_ratio < 0.8 else 'super-linearly'}")
6.2 Scaling with Content per Row¶
Does adding more columns or longer text fields affect costs?
# Compare: minimal columns vs rich content at the same row count.
n_rows = 20

# Minimal content
left_minimal, right_minimal = generate_company_data(n_rows, add_description=False)
_, result_minimal = await measure_merge(
    name=f"Minimal content ({n_rows} rows)",
    task="Match companies. Google→Alphabet, Facebook→Meta, AWS→Amazon.",
    left_table=left_minimal,
    right_table=right_minimal,
    merge_on_left="company",
    merge_on_right="company_name",
)

# Rich content: same rows plus a long repeated "description" column.
left_rich, right_rich = generate_company_data(n_rows, add_description=True)
_, result_rich = await measure_merge(
    name=f"Rich content ({n_rows} rows)",
    task="Match companies. Google→Alphabet, Facebook→Meta, AWS→Amazon.",
    left_table=left_rich,
    right_table=right_rich,
    merge_on_left="company",
    merge_on_right="company_name",
)

print(f"\nContent comparison ({n_rows} rows):")
print(f" Minimal ({len(left_minimal.columns)} cols): ${result_minimal.cost_dollars:.4f}, {result_minimal.duration_seconds:.1f}s")
print(f" Rich ({len(left_rich.columns)} cols): ${result_rich.cost_dollars:.4f}, {result_rich.duration_seconds:.1f}s")
Summary: Cost & Performance Findings¶
Let's compile all our experimental results into a final comparison.
# Create summary DataFrame from every recorded experiment.
summary_data = []
for r in all_results:
    cost_per_row = r.cost_dollars / r.rows if r.rows > 0 else 0
    summary_data.append({
        "Experiment": r.name,
        "Rows": r.rows,
        "Cost ($)": f"${r.cost_dollars:.4f}",
        "Time (s)": f"{r.duration_seconds:.1f}",
        "$/Row": f"${cost_per_row:.5f}",
        # Fixed: explicit None check so a genuine 0% accuracy is shown as
        # "0%" rather than "N/A" (consistent with ExperimentResult.__repr__).
        "Accuracy": f"{r.accuracy_pct:.0f}%" if r.accuracy_pct is not None else "N/A",
    })
summary_df = pd.DataFrame(summary_data)
print("\n" + "="*80)
print("COMPLETE EXPERIMENT SUMMARY")
print("="*80)
print(summary_df.to_string(index=False))
# Calculate key findings across all experiments.
total_cost = sum(r.cost_dollars for r in all_results)
total_rows = sum(r.rows for r in all_results)
total_time = sum(r.duration_seconds for r in all_results)

# Bucket experiments by how much they cost.
zero_cost = [r for r in all_results if r.cost_dollars < 0.001]
low_cost = [r for r in all_results if 0.001 <= r.cost_dollars < 0.01]

print("\n" + "="*60)
print("KEY FINDINGS")
print("="*60)
print(f"\nTotal rows processed: {total_rows}")
print(f"Total cost: ${total_cost:.4f}")
print(f"Total time: {total_time:.1f}s")
# Guard the division: all_results may be empty if no experiment cells ran.
avg_cost_per_row = total_cost / total_rows if total_rows > 0 else 0.0
print(f"Average cost per row: ${avg_cost_per_row:.5f}")
print(f"\nExperiments with zero/near-zero cost: {len(zero_cost)}")
for r in zero_cost:
    print(f" - {r.name}")
print(f"\nCost Optimization Strategies:")
print(f" 1. Use merge_on parameters when you know the columns")
print(f" 2. Clean data for fuzzy matching (typos are free to resolve)")
print(f" 3. Provide context in task description for semantic matches")
print(f" 4. LLM costs scale with semantic matches, not total rows")
Conclusion¶
The everyrow.merge() operation uses a cost-optimized cascade that makes intelligent merging surprisingly affordable:
Exact and fuzzy matches are free - typos, case differences, and minor variations don't cost anything
Only semantic matches incur costs - the LLM only processes rows that truly need reasoning (subsidiaries, acquisitions, regional names)
Providing merge_on hints reduces costs - when you know which columns to match, specify them
Costs scale with complexity, not size - a 1000-row dataset with clean data costs less than a 100-row dataset requiring semantic reasoning
For most real-world use cases, the majority of matches fall into the free tiers, making intelligent merging practical even for large datasets.
# Load a local Crunchbase-style dataset: unicorn companies and VC investments.
# NOTE(review): hard-coded local paths — these cells only run on the author's
# machine.
unicorn_companies = pd.read_csv("~/Downloads/unicorn_companies.csv")
investment_vcs = pd.read_csv("/Users/peter/Downloads/investments_VC.csv")

# Baseline: plain pandas inner join on the raw name columns, for comparison
# against the intelligent merge below.
merged_unicorns = pd.merge(
    unicorn_companies,
    investment_vcs,
    left_on="Company",
    right_on="name",
    how="inner",
    suffixes=("_unicorn", "_vc")
)
result_exact, stats_exact = await measure_merge(
name="crunchbase merge",
task="Match companies by (company) name.",
left_table=unicorn_companies.iloc[:5000],
right_table=investment_vcs.iloc[:5000],
)
result_exact[["company", "sector", "company_name", "revenue_billions"]].head(5)
df = pd.read_csv("/Users/peter/Downloads/fda_product_recalls.csv")

# Preview the columns of interest.  (Fixed: the original list ended with an
# empty-string column name "", a leftover that raises a KeyError.)
df[["recalling_firm_name", "product_type", "distribution_pattern"]]

from pandas import DataFrame  # public import path (was pandas.core.frame)

# read_csv leaves the date column as strings; parse it before comparing —
# comparing str values against a Timestamp raises a TypeError.
df_2021: DataFrame = df[pd.to_datetime(df['center_classification_date']) >= pd.Timestamp('2021-08-01')]
df_2021.head()
df_2021.tail()
df_2021.shape
from everyrow.ops import screen

# Screen the post-2021 recalls for child-relevant products inside a named
# session (result is streamed to the session; nothing is captured here).
async with create_session(name="FDA Recall Screening") as session:
    print(f"Session URL: {session.get_url()}")
    await screen(
        session=session,
        task="Find recalls of products that I might have used for my child born on 2021-08-01.",
        input=df_2021,
    )
import numpy as np
import pandas as pd

# Ground truth from a previous full-size (n=2246) run, used to score accuracy.
correct_df = pd.read_csv("/Users/peter/Downloads/merge_websites_correct_output_2246.csv")

def get_correct_website_for_name(name: str) -> str:
    """Return the known-correct personal website URL for *name*."""
    hits = correct_df.loc[correct_df["name"] == name, "personal_website_url"]
    return hits.values[0]
async with create_session(name="Website Matching (n=100)") as session:
    print(f"Session URL: {session.get_url()}")
    result = await merge(
        session=session,
        task="Match each person to their website(s).",
        left_table=pd.read_csv("/Users/peter/Downloads/merge_websites_input_left_100.csv"),
        right_table=pd.read_csv("/Users/peter/Downloads/merge_websites_input_right_100.csv"),
    )
print("num of matched rows:", len(result.data))
# The sentinel strings below distinguish pure-LLM matches from web-search
# matches — presumably set per row by the merge service in the research
# column; verify against the everyrow output format.
num_of_llm_matches = sum([1 if r["personal_website_url"] == 'This row was matched due to the information in both tables' else 0 for r in result.data.research])
num_of_web_search_matches = sum([1 if r["personal_website_url"].startswith('This row was matched due to the following information found in the web:') else 0 for r in result.data.research])
fraction_of_correct_matches = np.mean([1 if url == get_correct_website_for_name(name) else 0 for name, url in zip(result.data.name, result.data.personal_website_url)])
print("num of LLM matches:", num_of_llm_matches)
print("num of web search matches:", num_of_web_search_matches)
print("fraction of correct matches:", fraction_of_correct_matches)
[1 if r["personal_website_url"] == 'This row was matched due to the information in both tables' else 0 for r in result.research]
# Repeat the website-matching experiment at increasing scales, sequentially.
for n in [200, 400, 800, 1600, 2246]:
    async with create_session(name=f"Website Matching (n={n})") as session:
        print(f"Session URL: {session.get_url()}")
        result = await merge(
            session=session,
            task="Match each person to their website(s).",
            left_table=pd.read_csv(f"/Users/peter/Downloads/merge_websites_input_left_{n}.csv"),
            right_table=pd.read_csv(f"/Users/peter/Downloads/merge_websites_input_right_{n}.csv"),
        )
    print(f"n={n}")
    print("num of matched rows:", len(result.data))
    # Same sentinel-string classification as the n=100 cell above.
    num_of_llm_matches = sum([1 if r["personal_website_url"] == 'This row was matched due to the information in both tables' else 0 for r in result.data.research])
    num_of_web_search_matches = sum([1 if r["personal_website_url"].startswith('This row was matched due to the following information found in the web:') else 0 for r in result.data.research])
    fraction_of_correct_matches = np.mean([1 if url == get_correct_website_for_name(name) else 0 for name, url in zip(result.data.name, result.data.personal_website_url)])
    print("num of LLM matches:", num_of_llm_matches)
    print("num of web search matches:", num_of_web_search_matches)
    print("fraction of correct matches:", fraction_of_correct_matches)
    print("-"*100)
    print()
import json

def _report_saved_run(n: int) -> None:
    """Recompute match-type counts and accuracy from a saved merge output CSV.

    Deduplicates the three previously copy-pasted analysis cells (n=800,
    1600, 2246); printed output is unchanged.
    """
    result = pd.read_csv(f"/Users/peter/Downloads/merge_websites_output_{n}.csv")
    # The research column is stored as JSON strings in the CSV; parse it back
    # into per-row dicts.
    result.research = [json.loads(r) for r in result.research]
    print(f"n={n}")
    print("num of matched rows:", len(result))
    num_of_llm_matches = sum(
        1 for r in result.research
        if r["personal_website_url"] == 'This row was matched due to the information in both tables'
    )
    num_of_web_search_matches = sum(
        1 for r in result.research
        if r["personal_website_url"].startswith('This row was matched due to the following information found in the web:')
    )
    fraction_of_correct_matches = np.mean([
        1 if url == get_correct_website_for_name(name) else 0
        for name, url in zip(result.name, result.personal_website_url)
    ])
    print("num of LLM matches:", num_of_llm_matches)
    print("num of web search matches:", num_of_web_search_matches)
    print("fraction of correct matches:", fraction_of_correct_matches)
    print("-"*100)
    print()

for n in (800, 1600, 2246):
    _report_saved_run(n)
import asyncio

async def run_merge_and_report(n: int):
    """Run one live website-matching merge of size *n* and print its stats.

    Same logic as the sequential loop above, but packaged as a coroutine so
    several sizes can be launched concurrently with asyncio.gather.
    """
    async with create_session(name=f"Website Matching (n={n})") as session:
        print(f"Session URL: {session.get_url()}")
        result = await merge(
            session=session,
            task="Match each person to their website(s).",
            left_table=pd.read_csv(f"/Users/peter/Downloads/merge_websites_input_left_{n}.csv"),
            right_table=pd.read_csv(f"/Users/peter/Downloads/merge_websites_input_right_{n}.csv"),
        )
    print(f"n={n}")
    print("num of matched rows:", len(result.data))
    num_of_llm_matches = sum([1 if r["personal_website_url"] == 'This row was matched due to the information in both tables' else 0 for r in result.data.research])
    num_of_web_search_matches = sum([1 if r["personal_website_url"].startswith('This row was matched due to the following information found in the web:') else 0 for r in result.data.research])
    fraction_of_correct_matches = np.mean([1 if url == get_correct_website_for_name(name) else 0 for name, url in zip(result.data.name, result.data.personal_website_url)])
    print("num of LLM matches:", num_of_llm_matches)
    print("num of web search matches:", num_of_web_search_matches)
    print("fraction of correct matches:", fraction_of_correct_matches)
    print("-"*100)
    print()

# Run the two largest sizes concurrently.
await asyncio.gather(*(run_merge_and_report(n) for n in [1600, 2246]))