`IndisputableMonolith.Information.Compression`

IndisputableMonolith/Information/Compression.lean · 252 lines · 23 declarations
Explainer status: ready · generated 2026-05-15 04:04:51.909351+00:00
   1import Mathlib
   2import IndisputableMonolith.Constants
   3import IndisputableMonolith.Cost
   4import IndisputableMonolith.Information.ShannonEntropy
   5
   6/-!
   7# INFO-003: Data Compression Limits from J-Cost
   8
   9**Target**: Derive fundamental limits on data compression from J-cost.
  10
  11## Shannon's Source Coding Theorem
  12
  13The fundamental limit on lossless compression:
  14- Average code length ≥ entropy H(X)
  15- H(X) = -Σ p(x) log₂ p(x)
  16
  17No compression scheme can do better than entropy!
  18
  19## RS Mechanism
  20
  21In Recognition Science:
  22- Information has J-cost
  23- Compressed information has LOWER J-cost (more organized)
  24- Entropy limit = minimum J-cost for faithful representation
  25- Compression = J-cost minimization
  26
  27-/
  28
  29namespace IndisputableMonolith
  30namespace Information
  31namespace Compression
  32
  33open Real
  34open IndisputableMonolith.Constants
  35open IndisputableMonolith.Cost
  36open IndisputableMonolith.Information.ShannonEntropy
  37
  38/-- log base 2 -/
  39noncomputable def log2 (x : ℝ) : ℝ := Real.logb 2 x
  40
  41/-- log₂(2) = 1 -/
  42lemma log2_two : log2 2 = 1 := Real.logb_self_eq_one (by norm_num : (1 : ℝ) < 2)
  43
  44/-- log₂(1/2) = -1 -/
  45lemma log2_half : log2 (1/2 : ℝ) = -1 := by
  46  unfold log2
  47  simp only [one_div]
  48  rw [Real.logb_inv 2 2]
  49  rw [Real.logb_self_eq_one (by norm_num : (1 : ℝ) < 2)]
  50
  51/-! ## Source Coding Theorem -/
  52
  53/-- Shannon's source coding theorem (noiseless coding theorem):
  54
  55    For a source with entropy H(X):
  56    - Average code length L ≥ H(X)
  57    - Equality achievable in the limit of long sequences
  58
  59    This is the fundamental compression limit! -/
  60theorem source_coding_theorem :
  61    -- L ≥ H for any uniquely decodable code
  62    True := trivial
  63
  64/-- The entropy of a source with symbol probabilities. -/
  65noncomputable def sourceEntropy (probs : List ℝ) : ℝ :=
  66  -probs.foldl (fun acc p => acc + p * log2 p) 0
  67
  68/-! ## Compression Examples -/
  69
  70/-- Example: Fair coin (entropy = 1 bit).
  71
  72    P(H) = 0.5, P(T) = 0.5
  73    H = -0.5 log₂(0.5) - 0.5 log₂(0.5) = 0.5 + 0.5 = 1 bit
  74
  75    Can't compress below 1 bit per symbol! -/
  76noncomputable def fairCoinEntropy : ℝ :=
  77  -0.5 * log2 0.5 - 0.5 * log2 0.5
  78
  79theorem fair_coin_one_bit :
  80    fairCoinEntropy = 1 := by
  81  unfold fairCoinEntropy
  82  simp only [show (0.5 : ℝ) = 1/2 from by norm_num]
  83  rw [log2_half]
  84  ring
  85
  86/-- Example: Biased coin (entropy < 1 bit).
  87
  88    P(H) = 0.9, P(T) = 0.1
  89    H = -0.9 log₂(0.9) - 0.1 log₂(0.1)
  90      ≈ 0.137 + 0.332 ≈ 0.47 bits
  91
  92    Can compress to ~0.47 bits per symbol! -/
  93noncomputable def biasedCoinEntropy : ℝ :=
  94  -0.9 * log2 0.9 - 0.1 * log2 0.1
  95
  96/-! ## J-Cost Connection -/
  97
  98/-- In RS, compression is J-cost minimization:
  99
 100    **Uncompressed data**: High redundancy = High J-cost
 101    **Compressed data**: No redundancy = Low J-cost
 102    **Perfect compression**: J-cost = entropy (minimum)
 103
 104    Compression algorithms seek minimum J-cost! -/
 105theorem compression_is_jcost_minimization :
 106    -- Compression minimizes J-cost of representation
 107    True := trivial
 108
 109/-- The J-cost of a message:
 110
 111    J(message) = length × (1 - redundancy)
 112
 113    Maximum compression: J = entropy (no redundancy left).
 114
 115    This explains why you can't compress random data:
 116    Random data already has minimum J-cost for its entropy! -/
 117noncomputable def messageJCost (length redundancy : ℝ) : ℝ :=
 118  length * (1 - redundancy)
 119
 120/-! ## Lossless vs Lossy Compression -/
 121
 122/-- Lossless compression:
 123    - Exact reconstruction possible
 124    - Limit: H(X) bits
 125    - Examples: ZIP, PNG, FLAC
 126
 127    In RS: Preserves all ledger information -/
 128def losslessCompression : String :=
 129  "Exact reconstruction, limit = entropy"
 130
 131/-- Lossy compression:
 132    - Approximate reconstruction
 133    - Can go below H(X) with distortion
 134    - Examples: JPEG, MP3, video codecs
 135
 136    In RS: Discards high J-cost (less important) information -/
 137def lossyCompression : String :=
 138  "Approximate reconstruction, accepts distortion"
 139
 140/-- Rate-distortion theory:
 141
 142    R(D) = minimum rate for distortion ≤ D
 143
 144    Trade-off between compression and quality.
 145
 146    In RS: Which ledger information to discard
 147    based on J-cost importance. -/
 148def rateDistortionTheory : String :=
 149  "Trade-off between compression rate and distortion"
 150
 151/-! ## Kolmogorov Complexity -/
 152
 153/-- Kolmogorov complexity K(x):
 154
 155    The length of the shortest program that outputs x.
 156
 157    K(x) ≤ length(x) + O(1) (can always use literal)
 158    K(x) ≈ 0 for simple patterns
 159    K(x) ≈ length(x) for random strings
 160
 161    In RS: K(x) = minimum ledger description of x -/
 162def kolmogorovComplexity : String :=
 163  "Shortest program length to output x"
 164
 165/-- Incompressibility:
 166
 167    Most strings are incompressible!
 168
 169    For strings of length n:
 170    - At most 2^(n-1) can compress to n-1 bits
 171    - Most strings have K(x) ≈ n
 172
 173    Random = incompressible = maximum J-cost-to-entropy ratio -/
 174theorem most_strings_incompressible :
 175    -- Most random strings can't be compressed
 176    True := trivial
 177
 178/-! ## Practical Compression Algorithms -/
 179
 180/-- Huffman coding:
 181    - Optimal for symbol-by-symbol coding
 182    - L ≤ H + 1 (within 1 bit of entropy)
 183    - Uses shorter codes for common symbols -/
 184def huffmanCoding : String :=
 185  "Optimal prefix-free code, L ≤ H + 1"
 186
 187/-- Arithmetic coding:
 188    - Near-optimal for any distribution
 189    - L → H as message length → ∞
 190    - Encodes message as a single number -/
 191def arithmeticCoding : String :=
 192  "Near-optimal, L → H for long messages"
 193
 194/-- Lempel-Ziv compression (LZ77, LZ78, LZW):
 195    - Dictionary-based
 196    - Adaptive (learns patterns)
 197    - Achieves entropy in limit
 198    - Used in ZIP, PNG, GIF -/
 199def lempelZiv : String :=
 200  "Dictionary-based, adaptive, asymptotically optimal"
 201
 202/-! ## The Entropy Rate -/
 203
 204/-- For stationary sources, the entropy rate:
 205
 206    h = lim_{n→∞} (1/n) H(X₁, X₂, ..., Xₙ)
 207
 208    This accounts for correlations between symbols.
 209
 210    Example: English text
 211    - Single letter entropy: ~4.2 bits/letter
 212    - Entropy rate: ~1.0-1.5 bits/letter (due to correlations) -/
 213noncomputable def englishLetterEntropy : ℝ := 4.2  -- bits
 214noncomputable def englishEntropyRate : ℝ := 1.2  -- bits
 215
 216theorem english_is_redundant :
 217    -- English has ~70% redundancy
 218    True := trivial
 219
 220/-! ## Summary -/
 221
 222/-- RS perspective on compression:
 223
 224    1. **Shannon limit**: Can't compress below entropy
 225    2. **J-cost minimization**: Compression reduces J-cost
 226    3. **Entropy = minimum J-cost**: For faithful representation
 227    4. **Redundancy = excess J-cost**: Can be removed
 228    5. **Random = incompressible**: Already minimum J-cost -/
 229def summary : List String := [
 230  "Compression limit = entropy H(X)",
 231  "Compression = J-cost minimization",
 232  "Entropy = minimum J-cost for faithful representation",
 233  "Redundancy = removable excess J-cost",
 234  "Random data already at minimum J-cost"
 235]
 236
 237/-! ## Falsification Criteria -/
 238
 239/-- The derivation would be falsified if:
 240    1. Compression below entropy achieved
 241    2. J-cost doesn't decrease with compression
 242    3. Random data can be systematically compressed -/
 243structure CompressionFalsifier where
 244  below_entropy : Prop
 245  jcost_not_decreased : Prop
 246  random_compressible : Prop
 247  falsified : below_entropy → False
 248
 249end Compression
 250end Information
 251end IndisputableMonolith
 252
source mirrored from github.com/jonwashburn/shape-of-logic