A File Archiver

Type Aliases

i
-- A content address is just a hex string of the hash
abbrev Hash := String
i
-- A manifest maps each original file path to its content hash
abbrev Manifest := List (String × Hash)

abbrev vs. def for Type Aliases

Hashing Bytes

i
-- Hash a ByteArray to a 16-character hex string
def hashBytes (ba : ByteArray) : Hash :=
  let h : UInt64 := hash ba
  let hex := "0123456789abcdef".toList
  let digits := (List.range 16).map fun i =>
    let nibble := (h >>> (60 - i * 4).toUInt64) &&& 0xF
    hex[nibble.toNat]!
  String.ofList digits

Building the Manifest

i
-- Pure: deduplicate a list of (path, bytes) pairs into (Manifest, blob store)
def buildManifest (files : List (String × ByteArray)) :
    Manifest × List (Hash × ByteArray) :=
  let entries := files.map fun (path, bytes) => (path, hashBytes bytes, bytes)
  let manifest := entries.map fun (path, h, _) => (path, h)
  let blobs := entries.foldl (init := []) fun acc (_, h, bytes) =>
    if acc.any (·.1 == h) then acc else acc ++ [(h, bytes)]
  (manifest, blobs)

Snapshot: Writing Files

i
-- IO: write each unique blob to a file named <hash>.bck
def snapshot (files : List (String × ByteArray)) (archiveDir : String) :
    IO Manifest := do
  let (manifest, blobs) := buildManifest files
  for (h, bytes) in blobs do
    IO.FS.writeBinFile s!"{archiveDir}/{h}.bck" bytes
  return manifest

Restore: Reading Files Back

i
-- IO: read blobs from the archive and write them back to their original paths
def restore (manifest : Manifest) (archiveDir : String) : IO Unit := do
  for (path, h) in manifest do
    let bytes  IO.FS.readBinFile s!"{archiveDir}/{h}.bck"
    IO.FS.writeBinFile path bytes

Listing a Directory

i
-- Collect all regular files in a directory as (path, bytes) pairs
def listFiles (dir : String) : IO (List (String × ByteArray)) := do
  let entries  (dir : System.FilePath).readDir
  let mut files : List (String × ByteArray) := []
  for entry in entries do
    let info  entry.path.metadata
    if info.type == IO.FS.FileType.file then
      let bytes  IO.FS.readBinFile entry.path
      files := files ++ [(entry.path.toString, bytes)]
  return files

Archiving a Directory

i
-- Archive every regular file in srcDir to archiveDir
def snapshotDir (srcDir archiveDir : String) : IO Manifest := do
  IO.FS.createDirAll archiveDir
  let files  listFiles srcDir
  snapshot files archiveDir
i
-- Entry point: archive a directory and report results
-- Run with: lean --run archive/archive_dir.lean
def main : IO Unit := do
  let archiveDir := "archive-dir"
  IO.println s!"Scanning current directory..."
  let manifest  snapshotDir "." archiveDir
  IO.println s!"Archived {manifest.length} file(s) to {archiveDir}/"
  let blobs := (buildManifest ( listFiles ".")).2.length
  IO.println s!"Unique content blobs: {blobs}"
  IO.println "Done."

Testing the Pure Core

i
-- Tests (pure core) ---------------------------------------------------

def sampleFiles : List (String × ByteArray) := [
  ("a.txt", "hello".toUTF8),
  ("b.txt", "hello".toUTF8),   -- same content as a.txt
  ("c.txt", "world".toUTF8)
]

-- Two files with identical content should produce only two blobs
#guard (buildManifest sampleFiles).2.length == 2

-- Both a.txt and b.txt map to the same hash
#guard
  let (manifest, _) := buildManifest sampleFiles
  (manifest.find? (·.1 == "a.txt")).map (·.2) ==
  (manifest.find? (·.1 == "b.txt")).map (·.2)

The Full Picture

Running the Program

i
-- Entry point: snapshot sample files to ./archive-dir, then restore them
-- Run with: lean --run archive/archive.lean
def main : IO Unit := do
  let archiveDir := "archive-dir"
  -- create the archive directory (ok if it already exists)
  IO.FS.createDirAll archiveDir
  IO.println s!"Snapshotting to {archiveDir}/"
  let manifest  snapshot sampleFiles archiveDir
  IO.println s!"Created {manifest.length} manifest entries"
  IO.println "Restoring files..."
  restore manifest archiveDir
  IO.println "Done."

Compiling and Running

Exercises

Write: Total File Size

Write a pure function totalSize : Manifest × List (Hash × ByteArray) → Nat that returns the sum of all blob sizes (in bytes). Use ByteArray.size to get the size of each blob.

hint
  • Extract the blob list from the pair with .2
  • Use List.foldl with (· + ·) and an initial value of 0
  • Use ByteArray.size inside the map to convert bytes to sizes

Write: Count Duplicates

Write a pure function countDuplicates (files : List (String × ByteArray)) : Nat that returns how many files have content that also appears under another name. For the sample data, this should return 2 (both a.txt and b.txt are duplicates of each other).

hint
  • Build the manifest with buildManifest
  • Group files by hash: for each hash, count how many paths map to it
  • Sum the counts for hashes that appear more than once

Write: Manifest to String

Write manifestToString : Manifest → String that formats each entry as "path:hash" with one entry per line. Use String.intercalate "\n" to join the lines.

hint
  • Use List.map to convert each (path, hash) pair to s!"{path}:{hash}"
  • Then String.intercalate "\n" to join with newlines

Fix: Hash Digits in Wrong Order

i
-- hashBytes should produce correct hex digits, but the loop extracts
-- nibbles in the wrong order. The result is reversed. Fix it.
def hashBytes (ba : ByteArray) : String :=
  let h : UInt64 := hash ba
  let hex := "0123456789abcdef".toList
  let digits := (List.range 16).map fun i =>
    let nibble := (h >>> (i * 4).toUInt64) &&& 0xF
    hex[nibble.toNat]!
  String.ofList digits

#guard hashBytes "hello".toUTF8 == hashBytes "hello".toUTF8
#guard hashBytes "hello".toUTF8 != hashBytes "world".toUTF8
hint
  • List.range 16 generates [0, 1, …, 15], and i * 4 shifts by 0, 4, 8, … bits each iteration
  • This extracts the nibbles from least-significant to most-significant, but prints them in that same order
  • The most-significant nibble should come first in the output
  • Reverse the shift: 60 - i * 4 to start from the top nibble

Fix: Blobs Not Deduplicated

i
-- buildManifest should deduplicate blobs by hash, but it keeps every
-- blob entry instead. The blob store has duplicates. Fix the dedup check.
abbrev Hash := String

def hashBytes (ba : ByteArray) : Hash :=
  toString (hash ba)

def buildManifest (files : List (String × ByteArray)) :
    List (String × Hash) × List (Hash × ByteArray) :=
  let entries := files.map fun (path, bytes) => (path, hashBytes bytes, bytes)
  let manifest := entries.map fun (path, h, _) => (path, h)
  let blobs := entries.map fun (_, h, bytes) => (h, bytes)
  (manifest, blobs)

def sample : List (String × ByteArray) := [
  ("a.txt", "hello".toUTF8),
  ("b.txt", "hello".toUTF8),   -- same content as a.txt
  ("c.txt", "world".toUTF8)
]

#guard (buildManifest sample).2.length == 2
hint
  • entries.map copies every blob, including duplicates
  • The original buildManifest uses foldl with a check: if acc.any (·.1 == h) then acc else acc ++ [(h, bytes)]
  • Add a similar deduplication step before collecting blobs

Fix: Snapshot Missing Directory

i
-- snapshot should create the archive directory before writing blobs,
-- but it doesn't. The first file write will fail. Add the missing directory
-- creation step.
abbrev Hash := String

def snapshot (files : List (String × ByteArray)) (archiveDir : String) :
    IO Unit := do
  for (path, bytes) in files do
    let h : Hash := toString (hash bytes)
    IO.FS.writeBinFile s!"{archiveDir}/{h}.bck" bytes

#eval snapshot [("a.txt", "hello".toUTF8)] "test-archive"
hint
  • IO.FS.writeBinFile will fail if the parent directory doesn't exist
  • Add IO.FS.createDirAll archiveDir before the for loop
  • createDirAll won't fail if the directory already exists (like Python's os.makedirs(exist_ok=True))

Fix: Restore Uses Wrong Path

i
-- restore constructs the blob path wrong: it uses the file path instead
-- of the hash to read the blob. Fix the file path construction.
abbrev Hash := String
abbrev Manifest := List (String × Hash)

def restore (manifest : Manifest) (archiveDir : String) : IO Unit := do
  for (path, _h) in manifest do
    let bytes  IO.FS.readBinFile s!"{archiveDir}/{path}.bck"
    IO.FS.writeBinFile path bytes

-- To test: first run snapshot on sample data, then restore with this.
-- The bug path uses the file name instead of the hash for the blob file.
hint
  • The blob file is named archiveDir/<hash>.bck, not archiveDir/<path>.bck
  • The code uses path (the original filename) as the hash — that's wrong
  • Fix the destructuring: for (path, h) in manifest do and use h in the blob path

Write: Find File in Manifest

i
-- Look up a file path in the manifest and return its hash (if present).
-- You may use the existing definitions from archive.lean.
abbrev Hash := String
abbrev Manifest := List (String × Hash)

def findFile (manifest : Manifest) (path : String) : Option Hash :=
  none

#guard findFile [("a.txt", "abc123"), ("b.txt", "def456")] "a.txt" == some "abc123"
#guard findFile [("a.txt", "abc123")] "c.txt" == none
#guard findFile [] "x.txt" == none
hint
  • Use List.find? to search for the path in the manifest
  • The predicate is fun (p, _) => p == path
  • If found, extract the hash with .map (·.2)
  • List.find? already returns Option, so the result type matches

Write: Count Unique Blobs

i
-- Count how many unique blobs are in the blob store.
-- A blob store is a list of (hash, bytes) pairs.
abbrev Hash := String

def countUniqueBlobs (blobs : List (Hash × ByteArray)) : Nat :=
  0

def sampleBlobs : List (Hash × ByteArray) := [
  ("abc", "hello".toUTF8),
  ("def", "world".toUTF8),
  ("abc", "hello".toUTF8)   -- duplicate hash, same content
]

#guard countUniqueBlobs sampleBlobs == 2
#guard countUniqueBlobs [] == 0
#guard countUniqueBlobs [("x", "a".toUTF8)] == 1
hint
  • Extract all hashes from the blobs list, then deduplicate
  • Use List.map to get the hash list, then keep only the first occurrence of each
  • Same pattern as dbCompact from the database lesson: track seen hashes in a foldl
  • Return List.length of the deduplicated list

Write: Find Files by Hash

i
-- Find all file paths in the manifest that have the given hash.
abbrev Hash := String
abbrev Manifest := List (String × Hash)

def findByHash (manifest : Manifest) (target : Hash) : List String :=
  []

def m : Manifest := [("a.txt", "abc"), ("b.txt", "def"), ("c.txt", "abc")]

#guard findByHash m "abc" == ["a.txt", "c.txt"]
#guard findByHash m "xyz" == []
#guard findByHash [] "abc" == []
hint
  • Use List.filter to keep only entries where the hash matches
  • The predicate is fun (_, h) => h == target
  • Then use List.map (·.1) to extract just the file paths
  • Like finding all files that share the same content (deduplicated files)

Write: Diff Two Manifests

i
-- Compare two manifests. Return the paths that appear in the first
-- manifest but not the second (new or renamed files).
abbrev Hash := String
abbrev Manifest := List (String × Hash)

def diffManifests (old new : Manifest) : List String :=
  []

def oldManifest : Manifest := [("a.txt", "abc"), ("b.txt", "def")]
def newManifest : Manifest := [("a.txt", "abc"), ("c.txt", "xyz")]

#guard diffManifests oldManifest newManifest == ["b.txt"]
#guard diffManifests newManifest oldManifest == ["c.txt"]
#guard diffManifests [] [] == []
hint
  • Find paths that are in old but not in new
  • Use List.filter on the old manifest with a predicate that checks if the path exists in new
  • new.any (fun (p, _) => p == path) checks if a path exists in the new manifest
  • This is like comparing two snapshots to find renamed or deleted files

Fix: Manifest Contains Wrong Hash

i
-- buildManifest constructs the manifest, but the path and hash are mispaired.
-- The entry for each file should contain (path, hash). Fix the pairing.
abbrev Hash := String
abbrev Manifest := List (String × Hash)

def hashBytes (ba : ByteArray) : Hash :=
  let h : UInt64 := hash ba
  let hex := "0123456789abcdef".toList
  let digits := (List.range 16).map fun i =>
    let nibble := (h >>> (60 - i * 4).toUInt64) &&& 0xF
    hex[nibble.toNat]!
  String.ofList digits

def buildManifest (files : List (String × ByteArray)) :
    Manifest × List (Hash × ByteArray) :=
  let entries := files.map fun (path, bytes) => (path, hashBytes bytes, bytes)
  -- BUG: manifest pairs each path with the wrong file's hash
  -- The manifest is built by zipping paths with the *reversed* hash list
  let hashes := entries.map fun (_, h, _) => h
  let manifest := List.zip
    (entries.map fun (p, _, _) => p)
    hashes.reverse
  let blobs := entries.foldl (init := []) fun acc (_, h, bytes) =>
    if acc.any (·.1 == h) then acc else acc ++ [(h, bytes)]
  (manifest, blobs)

#guard
  let files := [("a.txt", "hello".toUTF8), ("b.txt", "world".toUTF8)]
  let (manifest, _) := buildManifest files
  (manifest.find? (·.1 == "a.txt")).map (·.2) == some (hashBytes "hello".toUTF8)

#guard
  let files := [("a.txt", "hello".toUTF8), ("b.txt", "hello".toUTF8)]
  let (manifest, _) := buildManifest files
  (manifest.find? (·.1 == "a.txt")).map (·.2) ==
  (manifest.find? (·.1 == "b.txt")).map (·.2)
hint
  • The manifest should pair each path with that file's own hash
  • The bug reverses the hash list before pairing, so a.txt gets b.txt's hash and vice versa
  • Remove .reverse so hashes stay in the same order as the paths