Dataframes in Gleam

What Is a Dataframe?

Column Types

i
/// One column of a dataframe: a list of values tagged with its element type.
pub type Column {
    /// A column whose cells are integers.
    IntCol(List(Int))
    /// A column whose cells are strings.
    StrCol(List(String))
}

Building a Dataframe

i
/// A table of named columns together with a stored row count.
pub type Dataframe {
    /// `cols` maps a column name to its data; `nrows` is the row count
    /// recorded by whichever function constructed this value.
    Dataframe(cols: dict.Dict(String, Column), nrows: Int)
}
i
/// Builds a dataframe from a list of `#(name, column)` pairs.
///
/// An empty list yields an empty dataframe (no columns, `nrows` of 0).
/// Otherwise the length of the first column, as measured by `col_length`,
/// becomes `nrows`, and every column must match it.
///
/// Returns an error when:
/// - a column's length differs from the first column's
///   ("column '<name>' has wrong length"), or
/// - a name appears more than once ("duplicate column '<name>'").
///   Inserting a repeated name into the dictionary would silently replace
///   the earlier column, so it is rejected instead.
///
/// Pairs are checked in order and the first problem found is reported.
pub fn make(pairs: List(#(String, Column))) -> Result(Dataframe, String) {
    case pairs {
        [] -> Ok(Dataframe(cols: dict.new(), nrows: 0))
        [#(_, first), ..] -> {
            let n = col_length(first)
            pairs
            |> list.try_fold(dict.new(), fn(cols, pair) {
                let #(name, col) = pair
                case col_length(col) == n, dict.has_key(cols, name) {
                    False, _ ->
                        Error("column '" <> name <> "' has wrong length")
                    True, True -> Error("duplicate column '" <> name <> "'")
                    True, False -> Ok(dict.insert(cols, name, col))
                }
            })
            |> result.map(fn(cols) { Dataframe(cols: cols, nrows: n) })
        }
    }
}

Accessing Columns

i
/// Returns the row count stored in the dataframe record.
pub fn nrows(df: Dataframe) -> Int {
    let Dataframe(nrows: count, ..) = df
    count
}

/// Returns the number of columns, i.e. the size of the column dictionary.
pub fn ncols(df: Dataframe) -> Int {
    df.cols |> dict.size
}

/// Returns the values of the integer column called `name`.
///
/// Fails with "no column '<name>'" when the name is absent, and with
/// "column '<name>' is not integer" when the column holds strings.
pub fn int_col(df: Dataframe, name: String) -> Result(List(Int), String) {
    df.cols
    |> dict.get(name)
    |> result.replace_error("no column '" <> name <> "'")
    |> result.try(fn(col) {
        case col {
            IntCol(values) -> Ok(values)
            StrCol(_) -> Error("column '" <> name <> "' is not integer")
        }
    })
}

/// Returns the values of the string column called `name`.
///
/// Fails with "no column '<name>'" when the name is absent, and with
/// "column '<name>' is not string" when the column holds integers.
pub fn str_col(df: Dataframe, name: String) -> Result(List(String), String) {
    df.cols
    |> dict.get(name)
    |> result.replace_error("no column '" <> name <> "'")
    |> result.try(fn(col) {
        case col {
            StrCol(values) -> Ok(values)
            IntCol(_) -> Error("column '" <> name <> "' is not string")
        }
    })
}

Selecting a Subset of Columns

i
/// Returns a dataframe holding only the columns listed in `names`.
///
/// Names are looked up in the order given; the first one that is missing
/// produces "no column '<name>'". The row count is copied from `df`.
pub fn select(df: Dataframe, names: List(String)) -> Result(Dataframe, String) {
    let lookup = fn(name) {
        case dict.get(df.cols, name) {
            Ok(col) -> Ok(#(name, col))
            Error(_) -> Error("no column '" <> name <> "'")
        }
    }
    // try_map stops at the first failed lookup and otherwise keeps input order.
    use picked <- result.map(list.try_map(names, lookup))
    Dataframe(cols: dict.from_list(picked), nrows: df.nrows)
}

Aggregation and Filtering

i
/// Sums the integer column called `name`.
///
/// Any error from `int_col` (missing column, or a string column) is
/// returned unchanged. An empty column sums to 0.
pub fn col_sum(df: Dataframe, name: String) -> Result(Int, String) {
    use values <- result.map(int_col(df, name))
    list.fold(values, 0, fn(total, value) { total + value })
}
i
/// Keeps the rows for which `pred` holds on the integer column `name`.
///
/// The predicate is evaluated once per row to build a boolean mask; that
/// mask is then applied to every column, and the number of `True` entries
/// becomes the new row count. Errors from `int_col` are returned as-is.
pub fn filter_rows(
    df: Dataframe,
    name: String,
    pred: fn(Int) -> Bool,
) -> Result(Dataframe, String) {
    case int_col(df, name) {
        Error(reason) -> Error(reason)
        Ok(values) -> {
            let keep = list.map(values, pred)
            // Count the surviving rows directly from the mask.
            let kept_count =
                list.fold(keep, 0, fn(count, flag) {
                    case flag {
                        True -> count + 1
                        False -> count
                    }
                })
            let filtered =
                dict.map_values(df.cols, fn(_name, col) {
                    keep_by_mask(col, keep)
                })
            Ok(Dataframe(cols: filtered, nrows: kept_count))
        }
    }
}

// Applies `mask` to a column of either variant, keeping the variant unchanged.
fn keep_by_mask(col: Column, mask: List(Bool)) -> Column {
    case col {
        StrCol(strings) -> strings |> keep_where(mask) |> StrCol
        IntCol(ints) -> ints |> keep_where(mask) |> IntCol
    }
}

// Returns the elements of `values` whose matching `mask` entry is True,
// in their original order. The two lists are zipped, so any elements past
// the end of the shorter list are ignored.
fn keep_where(values: List(a), mask: List(Bool)) -> List(a) {
    list.zip(values, mask)
    |> list.filter_map(fn(pair) {
        case pair {
            #(value, True) -> Ok(value)
            #(_, False) -> Error(Nil)
        }
    })
}

Running the Example

i
    // Three equal-length columns: one string column and two integer columns.
    let data = [
        #("name", StrCol(["Alice", "Bob", "Carol"])),
        #("age", IntCol([30, 25, 35])),
        #("score", IntCol([88, 92, 79])),
    ]
    case make(data) {
        Error(msg) -> io.println("error: " <> msg)
        Ok(df) -> {
            io.println("nrows=" <> string.inspect(nrows(df)))
            io.println("ncols=" <> string.inspect(ncols(df)))
            // col_sum returns a Result; the whole Result is inspected here,
            // not just the number inside it.
            io.println("total score=" <> string.inspect(col_sum(df, "score")))
            // Keep only the rows whose "age" value is at least 30.
            case filter_rows(df, "age", fn(age) { age >= 30 }) {
                Error(msg) -> io.println("filter error: " <> msg)
                Ok(seniors) -> {
                    io.println(
                        "age >= 30: "
                        <> string.inspect(nrows(seniors))
                        <> " rows",
                    )
                    io.println(
                        "names: " <> string.inspect(str_col(seniors, "name")),
                    )
                }
            }
        }
    }

    // Columns of different lengths: make returns an Error, which is printed
    // via string.inspect.
    io.println(
        "bad lengths: "
        <> string.inspect(make([
            #("x", IntCol([1, 2, 3])),
            #("y", IntCol([4, 5])),
        ])),
    )

FIXME: explain how to filter rows that are strings - is a separate function needed?

FIXME: can dataframe operations be chained with |> ?

Testing

i
// Two columns of equal length are accepted.
pub fn make_valid_test() {
    let pairs = [#("x", IntCol([1, 2, 3])), #("y", StrCol(["a", "b", "c"]))]
    should.be_ok(make(pairs))
}

// Columns of different lengths are rejected.
pub fn make_length_mismatch_test() {
    let pairs = [#("x", IntCol([1, 2, 3])), #("y", IntCol([4, 5]))]
    should.be_error(make(pairs))
}

// An empty pair list still produces a dataframe.
pub fn make_empty_test() {
    should.be_ok(make([]))
}

// nrows reports the length of the columns passed to make.
pub fn nrows_test() {
    let frame = should.be_ok(make([#("x", IntCol([1, 2, 3]))]))
    should.equal(nrows(frame), 3)
}

// ncols counts the distinct named columns.
pub fn ncols_test() {
    let pairs = [#("a", IntCol([1])), #("b", StrCol(["x"]))]
    let frame = should.be_ok(make(pairs))
    should.equal(ncols(frame), 2)
}

// int_col returns the stored integers for an existing integer column.
pub fn int_col_exists_test() {
    let frame = should.be_ok(make([#("n", IntCol([10, 20]))]))
    should.equal(int_col(frame, "n"), Ok([10, 20]))
}

// int_col fails for a name that is not in the dataframe.
pub fn int_col_missing_test() {
    let frame = should.be_ok(make([#("n", IntCol([1]))]))
    let _ = should.be_error(int_col(frame, "z"))
    Nil
}

// col_sum adds up every value in the integer column.
pub fn col_sum_test() {
    let frame = should.be_ok(make([#("v", IntCol([1, 2, 3, 4]))]))
    should.equal(col_sum(frame, "v"), Ok(10))
}

// Selecting two of three columns yields a two-column dataframe.
pub fn select_keeps_named_cols_test() {
    let pairs = [
        #("a", IntCol([1, 2])),
        #("b", StrCol(["x", "y"])),
        #("c", IntCol([3, 4])),
    ]
    let frame = should.be_ok(make(pairs))
    let subset = should.be_ok(select(frame, ["a", "c"]))
    should.equal(ncols(subset), 2)
}

// select fails when any requested name is absent.
pub fn select_missing_col_test() {
    let frame = should.be_ok(make([#("a", IntCol([1]))]))
    let _ = should.be_error(select(frame, ["a", "z"]))
    Nil
}

// Filtering on age >= 30 keeps two rows and filters the name column too.
pub fn filter_rows_test() {
    let pairs = [
        #("name", StrCol(["Alice", "Bob", "Carol"])),
        #("age", IntCol([30, 25, 35])),
    ]
    let frame = should.be_ok(make(pairs))
    let at_least_30 = fn(age) { age >= 30 }
    let kept = should.be_ok(filter_rows(frame, "age", at_least_30))
    should.equal(nrows(kept), 2)
    should.equal(str_col(kept, "name"), Ok(["Alice", "Carol"]))
}

Check Understanding

Why does make store nrows in the Dataframe record rather than computing it from a column each time it is needed?

Accessing the length of a list is O(n) in Gleam because lists are singly linked: every call to list.length walks the whole list. Storing the row count once avoids this cost for every subsequent nrows call. Operations that change the number of rows, such as filter_rows, compute the new count once while building the new column dictionary and store it in the record they return. The trade-off is that nrows must be updated correctly in every function that changes the shape of the dataframe.

What happens if you call filter_rows with a column name that holds strings?

filter_rows calls int_col(df, name) first. int_col pattern-matches on the column variant: if the named column is StrCol(_), it returns Error("column '...' is not integer"). filter_rows uses result.try, so it propagates that error immediately without ever applying the predicate or building a mask. The caller gets an Error and no filtering is performed.

Exercises

Float column (15 minutes)

Add FloatCol(List(Float)) to the Column type. Add float_col(df, name) -> Result(List(Float), String) and col_mean(df, name) -> Result(Float, String) that computes the column mean. Update make, keep_by_mask, and any other functions that pattern-match on Column. Write at least three tests.

Group by (20 minutes)

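Write group_by(df: Dataframe, name: String) -> Result(Dict(String, Dataframe), String) that partitions rows by the distinct string values in the named column. Each key in the result is one distinct string value; the associated dataframe contains only the rows where that column has that value. Note that filter_rows as written only accepts an integer column and a predicate on Int, so it cannot select rows by a string value directly: either write a string-predicate counterpart of filter_rows first, or build a boolean mask from the string column and reuse keep_by_mask. Test with at least two distinct groups.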

Add column (10 minutes)

Write add_col(df: Dataframe, name: String, col: Column) -> Result(Dataframe, String) that returns a new dataframe with the given column appended. Return Error if the column length does not match nrows(df) or if a column with that name already exists. Write three tests: one success, one length mismatch, one duplicate name.

Row at index (15 minutes)

Write row(df: Dataframe, idx: Int) -> Result(Dict(String, String), String) that returns all column values for a given row index as a dict mapping column name to its string representation (use int.to_string for integer columns). Return Error if idx is negative or out of range. Test with a valid index, a negative index, and an index equal to nrows.