File Deduplicator

The Problem

Grouping Files by Hash

i
fn group_by_hash(
  files: List(#(String, String)),
) -> dict.Dict(String, List(String)) {
  list.fold(files, dict.new(), fn(acc, file) {
    let #(path, content) = file
    let hash = hash_content(content)
    let existing = dict.get(acc, hash) |> result.unwrap([])
    dict.insert(acc, hash, [path, ..existing])
  })
}

Finding Duplicates

Once the files are grouped, finding duplicates is a filter:

i
pub fn find_duplicates(
  files: List(#(String, String)),
) -> List(List(String)) {
  files
  |> group_by_hash
  |> dict.values
  |> list.filter(fn(paths) { list.length(paths) > 1 })
}

Reporting

i
fn report_duplicates(groups: List(List(String))) -> String {
  case groups {
    [] -> "no duplicates found"
    _ -> {
      let count = groups |> list.length |> int.to_string
      count <> " duplicate group(s) found"
    }
  }
}

Tracking File Size

i
type FileInfo {
  FileInfo(path: String, hash: String, size: Int)
}
i
fn find_duplicates_with_size(
  files: List(FileInfo),
) -> #(List(List(String)), Int) {
  let grouped =
    list.fold(files, dict.new(), fn(acc, file) {
      let existing = dict.get(acc, file.hash)
      let paths = case existing {
        Ok(#(existing_paths, _)) -> [file.path, ..existing_paths]
        Error(_) -> [file.path]
      }
      dict.insert(acc, file.hash, #(paths, file.size))
    })

  let dup_groups =
    grouped
    |> dict.values
    |> list.filter(fn(entry) {
      let #(paths, _size) = entry
      list.length(paths) > 1
    })

  let dup_paths = list.map(dup_groups, fn(entry) {
    let #(paths, _size) = entry
    paths
  })

  let savings =
    dup_groups
    |> list.fold(0, fn(acc, entry) {
      let #(paths, size) = entry
      let copies = list.length(paths) - 1
      acc + size * copies
    })

  #(dup_paths, savings)
}

Walking the Filesystem

i
fn get_root() -> String {
  case argv.load().arguments {
    [path, ..] -> path
    [] ->
      simplifile.current_directory()
      |> result.unwrap(".")
  }
}
i
/// Recursively collect all regular files under `root`.
/// Returns a list of #(path, size) pairs.
/// Entries that cannot be read are silently skipped.
fn collect_files(root: String) -> List(#(String, Int)) {
  case simplifile.read_directory(at: root) {
    Error(_) -> []
    Ok(names) ->
      list.flat_map(names, fn(name) {
        let path = filepath.join(root, name)
        case simplifile.file_info(path) {
          Error(_) -> []
          Ok(info) ->
            case simplifile.file_info_type(info) {
              simplifile.File -> [#(path, info.size)]
              simplifile.Directory -> collect_files(path)
              _ -> []
            }
        }
      })
  }
}
i
/// In production, use a crypto library (e.g. gleam_crypto) to compute
/// SHA-256 of the file's bytes. Here we read the file and use its
/// content directly so the example needs no extra dependencies.
fn hash_file(path: String) -> Result(String, String) {
  simplifile.read(path)
  |> result.map_error(simplifile.describe_error)
}
i
fn run(root: String) -> String {
  let entries =
    collect_files(root)
    |> list.flat_map(fn(entry) {
      let #(path, size) = entry
      case hash_file(path) {
        Ok(hash) -> [#(path, hash, size)]
        Error(_) -> []
      }
    })

  let #(groups, savings) = find_duplicates_with_size(entries)
  case groups {
    [] -> "no duplicates found"
    _ ->
      int.to_string(list.length(groups))
      <> " duplicate group(s); "
      <> int.to_string(savings)
      <> " bytes could be freed"
  }
}

Testing

i
pub fn no_duplicates_test() {
  let files = [
    #("/a.txt", "aaa"),
    #("/b.txt", "bbb"),
    #("/c.txt", "ccc"),
  ]
  find_duplicates(files)
  |> should.equal([])
}

pub fn one_group_test() {
  let files = [
    #("/a.txt", "abc"),
    #("/b.txt", "def"),
    #("/c.txt", "abc"),
  ]
  let groups = find_duplicates(files)
  list.length(groups)
  |> should.equal(1)
}

pub fn three_copies_test() {
  let files = [
    #("/a.txt", "xyz"),
    #("/b.txt", "xyz"),
    #("/c.txt", "xyz"),
  ]
  let groups = find_duplicates(files)
  list.length(groups)
  |> should.equal(1)
  groups
  |> list.first
  |> should.be_ok
  |> list.length
  |> should.equal(3)
}

Check Understanding

Why does dict.get return Result instead of a default value?

dict.get could return Nil or an empty list when a key is missing, but that would silently hide bugs: a function that returns [] when the key is absent looks exactly like a function that found an empty list. Returning Result(List(String), Nil) forces the caller to decide what to do. In this code the right default is [], so |> result.unwrap([]) makes that choice explicit rather than hiding it.

Why does group_by_hash prepend the new path with [path, ..existing] instead of appending it with list.append(existing, [path])?

Prepending with [path, ..existing] is O(1) in Gleam because lists are linked lists: it creates a new node pointing to the existing list without copying it. Appending with list.append is O(n) because it must traverse the entire existing list to attach the new element at the end. Since the order of paths within a duplicate group does not affect correctness, prepending is the right choice here.

Exercises

Most-duplicated file (10 minutes)

Given the result of find_duplicates, write a function most_duplicated that returns the group with the most copies. Its return type should be Result(List(String), Nil): Ok(group) when at least one duplicate group exists, and Error(Nil) when there are no duplicates. Write two tests: one for an empty input and one for a list with multiple groups of different sizes.

Size savings (15 minutes)

The find_duplicates_with_size function shows how to track wasted space alongside duplicate groups. If a file with hash H appears n times and each copy is s bytes, then n - 1 of those copies are redundant, wasting (n - 1) * s bytes.

Using FileInfo (path, hash, size), write find_duplicates_with_size and return #(List(List(String)), Int) as shown in the size demo. Write tests confirming the savings calculation is correct for two groups of different sizes.

Skip unreadable files (10 minutes)

Change the input to List(#(String, Result(String, String))) where the Result represents either a valid hash or a read error. Filter out Error entries before grouping. Write a test confirming that a file with an Error hash is excluded from the results.

First-block optimisation (20 minutes)

Real deduplicators avoid hashing large files in full when a quick check can rule out duplicates. The trick is to hash only the first block (e.g., the first 4 KB) of each file to get a prefix_hash, then compute the full_hash only for files that share a prefix_hash. Files that differ in their first block cannot be identical, so skipping their full hash saves time.

Simulate this by giving each file two hashes in a record: prefix_hash (hash of a short prefix) and full_hash (hash of the entire content). Group by prefix_hash first; only form full-hash groups for files whose prefix_hash groups have size > 1. Return only true duplicates (i.e., files that share the same full_hash).