
Commit a710b22

docs: update readme.md to include dict builder

1 parent 5213ef7

13 files changed (+97, -71 lines)

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 **/*.rs.bk
 Cargo.lock
 /local_corpus_files
+/local_dict_corpus_files
 /orig-zstd
 fuzz_decodecorpus
 perf.data*

Cargo.toml

Lines changed: 3 additions & 2 deletions

@@ -15,7 +15,7 @@ categories = ["compression"]
 [dependencies]
 twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true }
 
 # Internal feature, only used when building as part of libstd, not part of the
 # stable interface of this crate.
 compiler_builtins = { version = "0.1.2", optional = true }
 core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
@@ -33,6 +33,7 @@ default = ["hash", "std"]
 hash = ["dep:twox-hash"]
 fuzz_exports = []
 std = []
+dict_builder = ["std"]
 
 # Internal feature, only used when building as part of libstd, not part of the
 # stable interface of this crate.
@@ -52,4 +53,4 @@ required-features = ["std"]
 
 [[bin]]
 name = "zstd_dict"
-required-features = ["std"]
+required-features = ["std", "dict_builder"]
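For downstream users, turning the new feature on is a one-line change in their own manifest. A minimal sketch; the version is a placeholder, and only the crate and feature names come from this diff:

```toml
[dependencies]
# "dict_builder" implies "std", per the feature definition above.
ruzstd = { version = "*", features = ["dict_builder"] }
```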

Readme.md

Lines changed: 32 additions & 5 deletions

@@ -15,22 +15,49 @@ This crate is currently actively maintained.
 
 # Current Status
 
-Feature complete on the decoder side.
+## Decompression
+The `decoding` module provides a complete
+implementation of a Zstandard decompressor.
+
+In terms of speed, `ruzstd` is behind the original C implementation,
+which has a Rust binding located [here](https://github.com/gyscos/zstd-rs).
 
+Measuring with the `time` utility, with the original zstd and my decoder
+both decoding the same enwik9.zst file from a ramfs, my decoder is about
+3.5 times slower. Enwik9 is highly compressible; for less compressible
+data (like an Ubuntu installation .iso) my decoder is only about 1.4
+times slower.
+
+## Compression
 On the compression side:
 - Support for generating compressed blocks at any compression level
   - [x] Uncompressed
   - [x] Fastest (roughly level 1)
   - [ ] Default (roughly level 3)
   - [ ] Better (roughly level 7)
   - [ ] Best (roughly level 11)
-- [ ] Checksums
+- [x] Checksums
 - [ ] Dictionaries
 
-## Speed
-In terms of speed this library is behind the original C implementation which has a rust binding located [here](https://github.com/gyscos/zstd-rs).
+## Dictionary Generation
+When the `dict_builder` feature is enabled, the `dictionary` module
+provides the ability to create new dictionaries.
+
+On the `github-users` sample set, our implementation benchmarks within
+0.2% of the official implementation (as of commit
+`09e52d07340acdb2e13817b066e8be6e424f7258`):
+```
+uncompressed: 100.00% (7484607 bytes)
+no dict: 34.99% of original size (2618872 bytes)
+reference dict: 16.16% of no-dict size (2195672 bytes smaller)
+our dict: 16.28% of no-dict size (2192400 bytes smaller)
+```
+
+The dictionary generator only supports creating "raw content"
+dictionaries; tagged dictionaries are currently unsupported.
 
-Measuring with the 'time' utility the original zstd and my decoder both decoding the same enwik9.zst file from a ramfs, my decoder is about 3.5 times slower. Enwik9 is highly compressible, for less compressible data (like a ubuntu installation .iso) my decoder comes close to only being 1.4 times slower.
+See <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format>
+for details.
 
 
 # How can you use it?
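The README names `create_raw_dict_from_dir` and `create_raw_dict_from_source` (see `src/bin/zstd_dict.rs` below), but this commit does not show their full signatures. A hypothetical sketch of a call, based only on the `P: AsRef<Path>, W: io::Write` generics visible in `src/dictionary/mod.rs`; the argument order and the error type are assumptions, not the crate's documented API:

```rust
// Hypothetical sketch: only the function name and the AsRef<Path> /
// io::Write bounds appear in this commit; everything else is assumed.
use ruzstd::dictionary::create_raw_dict_from_dir;
use std::fs::File;

fn main() -> std::io::Result<()> {
    let output = File::create("github-users.dict")?;
    // Assumed argument order: sample directory in, raw-content dictionary out.
    create_raw_dict_from_dir("local_dict_corpus_files/", output)?;
    Ok(())
}
```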

src/bin/zstd.rs

Lines changed: 33 additions & 35 deletions

@@ -34,9 +34,7 @@ fn decompress(flags: &[String], file_paths: &[String]) {
     }
 
     if flags.len() != 2 {
-        eprintln!(
-            "No flags other than -d and -c are currently implemented. Flags used: {flags:?}"
-        );
+        eprintln!("No flags other than -d and -c are currently implemented. Flags used: {flags:?}");
         return;
     }
 
@@ -156,40 +154,40 @@ fn main() {
     let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect();
     let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect();
     file_paths.remove(0);
-
+
     if flags.is_empty() {
-            let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
-            encoder.set_drain(Vec::new());
-
-            for path in file_paths {
-                let start_instant = Instant::now();
-                let file = std::fs::File::open(&path).unwrap();
-                let input_len = file.metadata().unwrap().len() as usize;
-                let file = PercentPrintReader {
-                    reader: BufReader::new(file),
-                    total: input_len,
-                    counter: 0,
-                    last_percent: 0,
-                };
-                encoder.set_source(file);
-                encoder.compress();
-                let mut output: Vec<_> = encoder.take_drain().unwrap();
-                println!(
-                    "Compressed {path:} from {} to {} ({}%) took {}ms",
-                    input_len,
-                    output.len(),
-                    if input_len == 0 {
-                        0
-                    } else {
-                        output.len() * 100 / input_len
-                    },
-                    start_instant.elapsed().as_millis()
-                );
-                output.clear();
-                encoder.set_drain(output);
-            }
+        let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
+        encoder.set_drain(Vec::new());
+
+        for path in file_paths {
+            let start_instant = Instant::now();
+            let file = std::fs::File::open(&path).unwrap();
+            let input_len = file.metadata().unwrap().len() as usize;
+            let file = PercentPrintReader {
+                reader: BufReader::new(file),
+                total: input_len,
+                counter: 0,
+                last_percent: 0,
+            };
+            encoder.set_source(file);
+            encoder.compress();
+            let mut output: Vec<_> = encoder.take_drain().unwrap();
+            println!(
+                "Compressed {path:} from {} to {} ({}%) took {}ms",
+                input_len,
+                output.len(),
+                if input_len == 0 {
+                    0
+                } else {
+                    output.len() * 100 / input_len
+                },
+                start_instant.elapsed().as_millis()
+            );
+            output.clear();
+            encoder.set_drain(output);
+        }
     } else {
-            decompress(&flags, &file_paths);
+        decompress(&flags, &file_paths);
     }
 }

src/bin/zstd_dict.rs

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 use ruzstd::dictionary::{create_raw_dict_from_dir, create_raw_dict_from_source};
+use std::env::args;
 use std::fs::File;
 use std::path::Path;
-use std::env::args;
 
 fn main() {
     let args: Vec<String> = args().collect();

src/bin/zstd_stream.rs

Lines changed: 1 addition & 3 deletions

@@ -18,9 +18,7 @@ fn main() {
     }
 
     if flags.len() != 2 {
-        eprintln!(
-            "No flags other than -d and -c are currently implemented. Flags used: {flags:?}"
-        );
+        eprintln!("No flags other than -d and -c are currently implemented. Flags used: {flags:?}");
         return;
     }
 

src/dictionary/cover.rs

Lines changed: 7 additions & 11 deletions

@@ -2,10 +2,10 @@
 //! described in the paper "Effective Construction of Relative Lempel-Ziv Dictionaries",
 //! by Liao, Petri, Moffat, and Wirth, published under the University of Melbourne.
 //!
-//! See: https://people.eng.unimelb.edu.au/ammoffat/abstracts/lpmw16www.pdf
+//! See: <https://people.eng.unimelb.edu.au/ammoffat/abstracts/lpmw16www.pdf>
 //!
 //! Facebook's implementation was also used as a reference.
-//! https://github.com/facebook/zstd/tree/dev/lib/dictBuilder
+//! <https://github.com/facebook/zstd/tree/dev/lib/dictBuilder>
 
 use super::DictParams;
 use crate::dictionary::frequency::estimate_frequency;
@@ -43,11 +43,7 @@ impl PartialEq for Segment {
 
 impl PartialOrd for Segment {
     fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
-        match self.score.partial_cmp(&other.score) {
-            Some(core::cmp::Ordering::Equal) => {}
-            ord => return ord,
-        }
-        self.score.partial_cmp(&other.score)
+        Some(self.cmp(other))
     }
 }
 
@@ -68,15 +64,15 @@ pub struct Context {
 
 /// Returns the highest scoring segment in an epoch
 /// as a slice of that epoch.
-pub fn pick_best_segment<'epoch>(
+pub fn pick_best_segment(
     params: &DictParams,
     ctx: &mut Context,
-    collection_sample: &'epoch [u8],
+    collection_sample: &'_ [u8],
 ) -> Segment {
     let mut segments = collection_sample
         .chunks(params.segment_size as usize)
         .peekable();
-    let mut best_segment: &[u8] = &segments.peek().expect("at least one segment");
+    let mut best_segment: &[u8] = segments.peek().expect("at least one segment");
     let mut top_segment_score: usize = 0;
     // Iterate over segments and score each segment, keeping track of the best segment
     for segment in segments {
@@ -107,7 +103,7 @@ fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) ->
         if ctx.frequencies.contains_key(kmer) {
             continue;
         }
-        let kmer_score = estimate_frequency(kmer, &collection_sample);
+        let kmer_score = estimate_frequency(kmer, collection_sample);
         ctx.frequencies.insert(*kmer, kmer_score);
         segment_score += kmer_score;
     }
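The `partial_cmp` change above swaps a redundant double comparison for the canonical pattern of delegating `PartialOrd` to `Ord`, which guarantees the two orderings can never disagree. A standalone sketch of the pattern, not the crate's actual `Segment` type:

```rust
use core::cmp::Ordering;

#[derive(PartialEq, Eq)]
struct Scored {
    score: usize,
}

impl Ord for Scored {
    // The single source of truth for ordering.
    fn cmp(&self, other: &Self) -> Ordering {
        self.score.cmp(&other.score)
    }
}

impl PartialOrd for Scored {
    // Defer to Ord; this is the shape Clippy recommends for types
    // with a total order, so the two impls stay consistent.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
```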

src/dictionary/mod.rs

Lines changed: 3 additions & 3 deletions

@@ -50,8 +50,8 @@ pub struct DictParams {
     /// As found under "4. Experiments - Varying Segment Size" in the original paper, a
     /// segment size of 2 kiB was effective.
     ///
-    /// "We explored a range of [segment_size] values and found the performance of LMC is insensitive
-    /// to [segment_size]. We fix [segment_size] to 2kiB
+    /// "We explored a range of \[`segment_size`\] values and found the performance of LMC is insensitive
+    /// to \[`segment_size`\]. We fix \[`segment_size`\] to 2kiB"
     ///
     /// Reasonable range: [16, 2048+]
     pub segment_size: u32,
@@ -85,7 +85,7 @@ pub fn create_raw_dict_from_dir<P: AsRef<Path>, W: io::Write>(
     for entry in dir {
         let entry = entry?;
         if entry.file_type()?.is_dir() {
-            recurse_read(fs::read_dir(&entry.path())?, file_paths)?;
+            recurse_read(fs::read_dir(entry.path())?, file_paths)?;
         } else {
             file_paths.push(entry.path());
         }
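For context, the `recurse_read` helper touched above is a plain recursive directory walk over `std::fs`. Reconstructed here as a self-contained sketch; the signature is inferred, since only the loop body appears in the diff:

```rust
use std::fs::{self, ReadDir};
use std::io;
use std::path::PathBuf;

// Recursively collect every regular-file path under a directory tree.
fn recurse_read(dir: ReadDir, file_paths: &mut Vec<PathBuf>) -> io::Result<()> {
    for entry in dir {
        let entry = entry?;
        if entry.file_type()?.is_dir() {
            // Descend into subdirectories, propagating IO errors.
            recurse_read(fs::read_dir(entry.path())?, file_paths)?;
        } else {
            file_paths.push(entry.path());
        }
    }
    Ok(())
}
```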

src/dictionary/reservoir.rs

Lines changed: 2 additions & 3 deletions

@@ -2,7 +2,7 @@ use super::cover::K;
 use alloc::vec::Vec;
 use core::f64::consts::E;
 use fastrand;
-use std::io;
+use std::{io, vec};
 
 /// Creates a representative sample of `input` of `size` bytes.
 pub fn create_sample<R: io::Read>(input: &mut R, size: usize) -> Vec<u8> {
@@ -31,8 +31,7 @@ impl Reservoir {
     /// Initialize a new empty reservoir, creating an allocation of `size`.
     pub fn new(size: usize) -> Self {
         assert!(size >= 16, "Reservoirs cannot be below 16 bytes in size");
-        let mut lake = Vec::with_capacity(size);
-        lake.resize(size, 0);
+        let lake: Vec<u8> = vec![0; size];
         let k = K as u16;
         Self { lake, k }
     }
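`create_sample` builds a fixed-size representative sample from a stream, which is the job reservoir sampling solves. For orientation, a sketch of the classic byte-level Algorithm R using the same `fastrand` dependency the module already imports; this is context only, not the crate's actual sampling strategy:

```rust
use std::io::{self, Read};

// Classic reservoir sampling (Algorithm R): every input byte ends up
// in the sample with equal probability, using O(size) memory.
fn sample_bytes<R: Read>(input: &mut R, size: usize) -> io::Result<Vec<u8>> {
    let mut reservoir = vec![0u8; size];
    let mut seen = 0usize;
    let mut byte = [0u8; 1];
    while input.read(&mut byte)? == 1 {
        if seen < size {
            // Fill phase: keep the first `size` bytes unconditionally.
            reservoir[seen] = byte[0];
        } else {
            // Replacement phase: keep this byte with probability size / (seen + 1).
            let j = fastrand::usize(..=seen);
            if j < size {
                reservoir[j] = byte[0];
            }
        }
        seen += 1;
    }
    // If the input was shorter than the reservoir, shrink to what we saw.
    reservoir.truncate(seen.min(size));
    Ok(reservoir)
}
```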

src/lib.rs

Lines changed: 1 addition & 0 deletions

@@ -35,6 +35,7 @@ macro_rules! vprintln {
 mod bit_io;
 mod common;
 pub mod decoding;
+#[cfg(feature = "dict_builder")]
 pub mod dictionary;
 pub mod encoding;
 
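With this gate in place, `ruzstd::dictionary` is only compiled when `dict_builder` is enabled. A downstream crate that must build both ways can declare a passthrough feature of its own and mirror the gate; a minimal sketch, with the passthrough feature name being an assumption:

```rust
// Sketch of downstream use. Assumes the consumer's own Cargo.toml
// declares a passthrough feature (name assumed):
//   [features]
//   dict_builder = ["ruzstd/dict_builder"]
// Re-export the builders only when the gated module actually exists.
#[cfg(feature = "dict_builder")]
pub use ruzstd::dictionary::{create_raw_dict_from_dir, create_raw_dict_from_source};
```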
