Data, I/O, and Testing

Quoted Fields in CSV Parsing

Run the parser with the provided CSV file. Do all rows produce the correct number of fields? Pay particular attention to rows that contain commas.

import sys


def parse_scores(filename):
    """Return a list of (name, city, score) triples from a CSV file.

    NOTE: deliberately naive for this exercise — it splits on every
    comma, so a quoted field containing a comma is broken apart.
    """
    with open(filename) as f:
        next(f)  # skip the header row
        return [
            (fields[0], fields[1], int(fields[2]))
            for fields in (raw.strip().split(",") for raw in f)
        ]


if __name__ == "__main__":
    # Default to the bundled demo file when no argument is given.
    filename = "quotecsv.csv" if len(sys.argv) < 2 else sys.argv[1]
    for name, city, score in parse_scores(filename):
        print(f"{name} ({city}): {score}")
name,city,score
Alice,New York,95
Bob,"Los Angeles, CA",87
Carol,Chicago,92
Dave,"Austin, TX",88
Show explanation

The bug is using line.split(',') instead of the csv module, so rows that contain commas inside quoted fields are split incorrectly. Teaches why hand-rolled parsers fail on real-world data and when to use standard library tools.

Wall Time vs. Monotonic Time

Run the benchmarking function several times and examine the elapsed time values. Do any of them look unusual?

import time
from unittest.mock import patch


def measure(func):
    """Time a single call to func; return (elapsed_seconds, return_value).

    NOTE: uses time.time() (wall clock) on purpose — the exercise's bug:
    a system clock step between the two readings can make the reported
    duration negative.
    """
    before = time.time()
    value = func()
    after = time.time()
    return after - before, value


def work():
    """Simulated workload: pause for 50 ms, then report completion."""
    pause = 0.05
    time.sleep(pause)
    return "done"


if __name__ == "__main__":
    elapsed, result = measure(work)
    print(f"Normal:           {elapsed:.4f}s  result={result!r}")

    # Fake clock: first reading 1000.0, every later reading 999.7 —
    # simulates the system clock being set back mid-measurement.
    calls = [0]
    def stepped_back():
        calls[0] += 1
        if calls[0] == 1:
            return 1000.0
        return 999.7

    with patch("time.time", stepped_back):
        elapsed, result = measure(work)
    print(f"After clock step: {elapsed:.4f}s  result={result!r}")
    print("(time.monotonic() would never produce a negative duration)")
Show explanation

The bug is using time.time() without accounting for system clock adjustments, so the function reports negative durations when the clock is set back. Teaches the difference between wall time and monotonic time and when to use time.monotonic.

Naive Datetime and Daylight Saving

Call the date arithmetic function with a date near a daylight saving transition. Compare the result from the naive datetime path with the result from the timezone-aware path.

from datetime import datetime, timedelta


def add_days(start_str, days):
    """Return the wall-clock time `days` days after start_str (YYYY-MM-DD HH:MM).

    NOTE: operates on a naive datetime on purpose — the exercise's bug:
    DST transitions are ignored, so "one day later" may not equal 24
    elapsed hours.
    """
    moment = datetime.fromisoformat(start_str)
    return moment + timedelta(days=days)


if __name__ == "__main__":

    start = "2024-03-09 08:00"
    result = add_days(start, days=1)

    print(f"Start:          {start}  (America/New_York, pre-DST)")
    print(f"+ 1 naive day:  {result}")
    print()
    print("The naive result is 2024-03-10 08:00, which looks correct.")
    print("But clocks sprang forward at 02:00, so only 23 hours elapsed.")
    print("A timezone-aware calculation using zoneinfo or pytz would show")
    print("the gap and let you choose: 23 wall-clock hours or 24 absolute hours.")
    print()

    # Show what UTC-based arithmetic reveals
    try:
        from zoneinfo import ZoneInfo
        tz = ZoneInfo("America/New_York")
        aware = datetime(2024, 3, 9, 8, 0, tzinfo=tz)
        print(f"Aware start UTC offset: {aware.utcoffset()}  ({aware.tzname()})")
        result_aware = aware + timedelta(days=1)
        print(f"Aware result UTC offset: {result_aware.utcoffset()}  ({result_aware.tzname()})")
        # Convert both instants to UTC epoch seconds to measure the true
        # elapsed time.  (Removed a dead assignment that built an unused
        # tuple of utctimetuple() results here.)
        import calendar
        start_utc = calendar.timegm(aware.utctimetuple())
        end_utc = calendar.timegm(result_aware.utctimetuple())
        print(f"Actual elapsed seconds: {end_utc - start_utc}  "
              f"(86400 = 24h; 82800 = 23h)")
    except ImportError:
        print("Install Python 3.9+ for zoneinfo demonstration")
Show explanation

The bug is adding a timedelta to a naive datetime, so around daylight saving time transitions the function's result is off by one hour of real elapsed time ("one day later" reads correctly on the wall clock, but only 23 hours actually pass). Teaches the difference between naive and timezone-aware datetimes.

Missing Elements in HTML Parsing

Run the script with the provided HTML file and check whether it finds all the expected elements. What happens when an element is not found?

import re
import sys


def extract_prices(html):
    """Return all prices found in the HTML as a list of floats.

    NOTE: matches only <span class="price"> on purpose — the exercise's
    bug: pages that use a different class name silently yield an empty
    list rather than an error.
    """
    price_re = re.compile(r'<span class="price">\$([\d.]+)</span>')
    return [float(text) for text in price_re.findall(html)]


if __name__ == "__main__":
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        filename = "missparse.html"
    with open(filename) as f:
        html = f.read()
    prices = extract_prices(html)
    if not prices:
        print("No prices found.")
    else:
        print(f"Prices: {prices}")
        print(f"Total:  ${sum(prices):.2f}")
<!DOCTYPE html>
<html>
<body>
  <div class="product">
    <h2>Widget</h2>
    <span class="product-price">$9.99</span>
  </div>
  <div class="product">
    <h2>Gadget</h2>
    <span class="product-price">$24.99</span>
  </div>
  <div class="product">
    <h2>Doohickey</h2>
    <span class="product-price">$4.99</span>
  </div>
</body>
</html>
Show explanation

The bug is that the HTML structure varies and the selector matches zero elements without raising an error, so the script fails silently on some pages. Teaches how to handle missing data in HTML parsing and use assertions to catch unexpected input.

Overly Permissive Regular Expression

Test the regular expression against a few valid email addresses and a few strings that look like email addresses but are not. Does it reject the invalid ones?

import re

# Deliberately permissive (the exercise's bug): no anchors, no dot or
# TLD requirement, so strings such as "foo@bar" and "x@y" also match.
EMAIL_PATTERN = r"\w+@\w+"


def extract_emails(text):
    """Return every substring of text that matches EMAIL_PATTERN."""
    hits = re.findall(EMAIL_PATTERN, text)
    return hits


if __name__ == "__main__":
    cases = [
        ("alice@example.com",        True),   # valid
        ("bob.smith@uni.edu",        True),   # valid (but dots in local part missed)
        ("not-an-email",             False),  # should not match
        ("foo@bar",                  False),  # no TLD — should not match
        ("user@host with spaces",    False),  # malformed — should not match
        ("x@y",                      False),  # too short — should not match
    ]
    for sample, expected in cases:
        hits = extract_emails(sample)
        verdict = "OK  " if bool(hits) == expected else "FAIL"
        print(f"{verdict}  {sample!r:35s} -> {hits}")
Show explanation

The bug is a pattern that is too permissive (e.g., missing anchors or character class constraints), so the regular expression also matches invalid strings. Teaches how to test regular expressions with both valid and invalid inputs.

Test with No Assertions

Run the test suite. Does it pass? Now deliberately break the function the test is testing. Does the test still pass?

def average(numbers):
    """Return the mean of a non-empty list of numbers.

    NOTE: deliberately wrong (divides by len - 1, not len) so the
    exercise can show that assertion-free tests still pass on a broken
    function.
    """
    total = sum(numbers)
    return total / (len(numbers) - 1)


def test_average_single():
    # Assertion-free: the return value is discarded, so this "test" can
    # never fail on a wrong result.  (If it were actually run,
    # average([10]) would divide by zero, since len - 1 == 0.)
    value = average([10])

def test_average_simple():
    # Assertion-free — the computed mean is thrown away, so it always "passes".
    value = average([1, 2, 3])

def test_average_known():
    # Assertion-free — average([2, 4, 6, 8]) returns 20/3 rather than the
    # expected 5.0, but nothing here checks it.
    value = average([2, 4, 6, 8])


if __name__ == "__main__":
    suite = [test_average_simple, test_average_known]
    for case in suite:
        try:
            case()
            print(f"PASS {case.__name__}")   # always prints PASS
        except Exception as exc:
            print(f"FAIL {case.__name__}: {exc}")
    print()
    print(f"average([2, 4, 6, 8]) = {average([2, 4, 6, 8])}  (expected 5.0)")
Show explanation

The bug is that the test calls the function but never asserts anything about the result, so it always passes even when the function is broken. Teaches that a test with no assertions is not a test and how to write assertions correctly.

Shared State Between Tests

Run each test on its own. Then run both together. Do you get the same results both ways?

# Module-level registry — shared state that persists between tests
# (the exercise's bug: one test's writes leak into the next test run).
REGISTRY = {}


def register(name, value):
    """Store value in the shared module-level REGISTRY under name."""
    REGISTRY[name] = value


def lookup(name):
    """Return the value registered under name, or None if it is absent."""
    return REGISTRY.get(name)


# --- tests ---

def test_register():
    # Mutates the shared module-level REGISTRY; the entry persists after
    # this test, breaking any later test that assumes an empty registry.
    register("alpha", 1)
    assert lookup("alpha") == 1


def test_lookup_absent():
    # Passes when run alone, but the second assert fails after
    # test_register has already added an entry to the shared REGISTRY.
    assert lookup("absent") is None
    assert len(REGISTRY) == 0


def test_overwrite():
    # Depends on shared state: overwrites "alpha" if test_register ran
    # first, or creates it fresh when run alone — both happen to pass.
    register("alpha", 99)
    assert lookup("alpha") == 99


if __name__ == "__main__":
    suite = [test_register, test_lookup_absent, test_overwrite]
    for case in suite:
        try:
            case()
            print(f"PASS {case.__name__}")
        except AssertionError as err:
            print(f"FAIL {case.__name__}: {err}")
Show explanation

The bug is that one test modifies a module-level variable that another test depends on, so the suite passes in isolation but fails when run together. Teaches test isolation, teardown, and the risks of shared global state.

Hardcoded Absolute Path

Run the script from a different working directory than the one where the script file is saved. Does it find its configuration file?

import json


def load_config():
    """Load configuration from the project config file.

    NOTE: the hardcoded absolute path below is the exercise's bug — it
    exists only on the original author's machine, so the function fails
    everywhere else.
    """
    config_path = "/Users/gvwilson/unbreak/diot/abspath.json"
    with open(config_path) as handle:
        return json.load(handle)


if __name__ == "__main__":
    settings = load_config()
    print(f"threshold:   {settings['threshold']}")
    print(f"max_retries: {settings['max_retries']}")
    print(f"output_dir:  {settings['output_dir']}")
{
    "threshold": 0.5,
    "max_retries": 3,
    "output_dir": "results"
}
Show explanation

The bug is using a hardcoded absolute path instead of a path relative to the script's location, so the function behaves differently on different machines. Teaches the difference between __file__-relative and working-directory-relative paths.

Unserializable Datetime in JSON

Run the script and read the error message. Which value in the data structure cannot be serialized?

import json
from datetime import datetime


def make_report(title, value):
    """Build a report dict including the current timestamp.

    NOTE: "generated_at" holds a raw datetime object, which json.dumps
    cannot serialize — the exercise's bug.
    """
    report = {"title": title, "value": value}
    report["generated_at"] = datetime.now()
    return report


if __name__ == "__main__":
    report = make_report("monthly_sales", 48291.75)
    print(f"Report dict: {report}")
    print("Serializing to JSON...")
    # Raises TypeError: the datetime in the dict is not JSON serializable.
    print(json.dumps(report))
Show explanation

The bug is that the data contains datetime objects, which are not JSON-serializable, so the program raises an error when writing output. Teaches how to identify serialization errors and write custom JSON encoders.

Wrong Logging Level

Run the script and then look at the log output. Are the messages you expected to see present?

import logging

# Root logger configured at WARNING with no filename, so records go to
# stderr — and the DEBUG records emitted below are filtered out entirely
# (the exercise's bug).
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s: %(message)s",
)
logger = logging.getLogger(__name__)


def process(items):
    """Double each item, logging progress at DEBUG level.

    NOTE: all logging here is at DEBUG on purpose — the exercise's bug:
    the root logger is set to WARNING, so none of it ever appears.
    """
    logger.debug(f"Starting process() with {len(items)} items")
    doubled = []
    for element in items:
        logger.debug(f"  processing item {element!r}")
        doubled.append(element * 2)
    logger.debug(f"Finished: {len(doubled)} results")
    return doubled


if __name__ == "__main__":
    numbers = [1, 2, 3, 4, 5]
    result = process(numbers)
    print(f"Output: {result}")
    print("(no debug messages shown — logger.debug() calls are silenced by WARNING level)")
    print(f"Effective log level: {logging.getLevelName(logger.getEffectiveLevel())}")
Show explanation

The bug is that the log level is set to WARNING but the calls use logger.debug(), so the messages never appear in the log output. Teaches how Python's logging hierarchy works and how to verify the effective log level.