
Commit a710b22

docs: update readme.md to include dict builder

1 parent 5213ef7

13 files changed (+97, -71 lines)

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 **/*.rs.bk
 Cargo.lock
 /local_corpus_files
+/local_dict_corpus_files
 /orig-zstd
 fuzz_decodecorpus
 perf.data*

Cargo.toml

Lines changed: 3 additions & 2 deletions

@@ -15,7 +15,7 @@ categories = ["compression"]
 [dependencies]
 twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"], optional = true }
 
 # Internal feature, only used when building as part of libstd, not part of the
 # stable interface of this crate.
 compiler_builtins = { version = "0.1.2", optional = true }
 core = { version = "1.0.0", optional = true, package = "rustc-std-workspace-core" }
@@ -33,6 +33,7 @@ default = ["hash", "std"]
 hash = ["dep:twox-hash"]
 fuzz_exports = []
 std = []
+dict_builder = ["std"]
 
 # Internal feature, only used when building as part of libstd, not part of the
 # stable interface of this crate.
@@ -52,4 +53,4 @@ required-features = ["std"]
 
 [[bin]]
 name = "zstd_dict"
-required-features = ["std"]
+required-features = ["std", "dict_builder"]
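For downstream users, turning the new feature on is a one-line change in their own manifest. A minimal sketch; the version is a placeholder, and only the crate and feature names come from this diff:

```toml
[dependencies]
# "dict_builder" implies "std", per the feature definition above.
ruzstd = { version = "*", features = ["dict_builder"] }
```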

Readme.md

Lines changed: 32 additions & 5 deletions

@@ -15,22 +15,49 @@ This crate is currently actively maintained.
 
 # Current Status
 
-Feature complete on the decoder side.
+## Decompression
+The `decoding` module provides a complete
+implementation of a Zstandard decompressor.
+
+In terms of speed, `ruzstd` is behind the original C implementation,
+which has a Rust binding located [here](https://github.com/gyscos/zstd-rs).
 
+Measuring with the `time` utility, with the original zstd and my decoder
+both decoding the same enwik9.zst file from a ramfs, my decoder is about
+3.5 times slower. Enwik9 is highly compressible; for less compressible
+data (like an Ubuntu installation .iso) my decoder is only about 1.4
+times slower.
+
+## Compression
 On the compression side:
 - Support for generating compressed blocks at any compression level
   - [x] Uncompressed
   - [x] Fastest (roughly level 1)
   - [ ] Default (roughly level 3)
   - [ ] Better (roughly level 7)
   - [ ] Best (roughly level 11)
-- [ ] Checksums
+- [x] Checksums
 - [ ] Dictionaries
 
-## Speed
-In terms of speed this library is behind the original C implementation which has a rust binding located [here](https://github.com/gyscos/zstd-rs).
+## Dictionary Generation
+When the `dict_builder` feature is enabled, the `dictionary` module
+provides the ability to create new dictionaries.
+
+On the `github-users` sample set, our implementation benchmarks within
+0.2% of the official implementation (as of commit
+`09e52d07340acdb2e13817b066e8be6e424f7258`):
+```
+uncompressed: 100.00% (7484607 bytes)
+no dict: 34.99% of original size (2618872 bytes)
+reference dict: 16.16% of no-dict size (2195672 bytes smaller)
+our dict: 16.28% of no-dict size (2192400 bytes smaller)
+```
+
+The dictionary generator only supports creating "raw content"
+dictionaries; tagged dictionaries are currently unsupported.
 
-Measuring with the 'time' utility the original zstd and my decoder both decoding the same enwik9.zst file from a ramfs, my decoder is about 3.5 times slower. Enwik9 is highly compressible, for less compressible data (like a ubuntu installation .iso) my decoder comes close to only being 1.4 times slower.
+See <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format>
+for details.
 
 
 # How can you use it?
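The README names `create_raw_dict_from_dir` and `create_raw_dict_from_source` (see `src/bin/zstd_dict.rs` below), but this commit does not show their full signatures. A hypothetical sketch of a call, based only on the `P: AsRef<Path>, W: io::Write` generics visible in `src/dictionary/mod.rs`; the argument order and the error type are assumptions, not the crate's documented API:

```rust
// Hypothetical sketch: only the function name and the AsRef<Path> /
// io::Write bounds appear in this commit; everything else is assumed.
use ruzstd::dictionary::create_raw_dict_from_dir;
use std::fs::File;

fn main() -> std::io::Result<()> {
    let output = File::create("github-users.dict")?;
    // Assumed argument order: sample directory in, raw-content dictionary out.
    create_raw_dict_from_dir("local_dict_corpus_files/", output)?;
    Ok(())
}
```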

src/bin/zstd.rs

Lines changed: 33 additions & 35 deletions

@@ -34,9 +34,7 @@ fn decompress(flags: &[String], file_paths: &[String]) {
     }
 
     if flags.len() != 2 {
-        eprintln!(
-            "No flags other than -d and -c are currently implemented. Flags used: {flags:?}"
-        );
+        eprintln!("No flags other than -d and -c are currently implemented. Flags used: {flags:?}");
         return;
     }
 
@@ -156,40 +154,40 @@ fn main() {
     let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect();
     let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect();
     file_paths.remove(0);
-
+
     if flags.is_empty() {
-            let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
-            encoder.set_drain(Vec::new());
-
-            for path in file_paths {
-                let start_instant = Instant::now();
-                let file = std::fs::File::open(&path).unwrap();
-                let input_len = file.metadata().unwrap().len() as usize;
-                let file = PercentPrintReader {
-                    reader: BufReader::new(file),
-                    total: input_len,
-                    counter: 0,
-                    last_percent: 0,
-                };
-                encoder.set_source(file);
-                encoder.compress();
-                let mut output: Vec<_> = encoder.take_drain().unwrap();
-                println!(
-                    "Compressed {path:} from {} to {} ({}%) took {}ms",
-                    input_len,
-                    output.len(),
-                    if input_len == 0 {
-                        0
-                    } else {
-                        output.len() * 100 / input_len
-                    },
-                    start_instant.elapsed().as_millis()
-                );
-                output.clear();
-                encoder.set_drain(output);
-            }
+        let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
+        encoder.set_drain(Vec::new());
+
+        for path in file_paths {
+            let start_instant = Instant::now();
+            let file = std::fs::File::open(&path).unwrap();
+            let input_len = file.metadata().unwrap().len() as usize;
+            let file = PercentPrintReader {
+                reader: BufReader::new(file),
+                total: input_len,
+                counter: 0,
+                last_percent: 0,
+            };
+            encoder.set_source(file);
+            encoder.compress();
+            let mut output: Vec<_> = encoder.take_drain().unwrap();
+            println!(
+                "Compressed {path:} from {} to {} ({}%) took {}ms",
+                input_len,
+                output.len(),
+                if input_len == 0 {
+                    0
+                } else {
+                    output.len() * 100 / input_len
+                },
+                start_instant.elapsed().as_millis()
+            );
+            output.clear();
+            encoder.set_drain(output);
+        }
     } else {
-            decompress(&flags, &file_paths);
+        decompress(&flags, &file_paths);
     }
 }

src/bin/zstd_dict.rs

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 use ruzstd::dictionary::{create_raw_dict_from_dir, create_raw_dict_from_source};
+use std::env::args;
 use std::fs::File;
 use std::path::Path;
-use std::env::args;
 
 fn main() {
     let args: Vec<String> = args().collect();

src/bin/zstd_stream.rs

Lines changed: 1 addition & 3 deletions

@@ -18,9 +18,7 @@ fn main() {
     }
 
     if flags.len() != 2 {
-        eprintln!(
-            "No flags other than -d and -c are currently implemented. Flags used: {flags:?}"
-        );
+        eprintln!("No flags other than -d and -c are currently implemented. Flags used: {flags:?}");
         return;
     }
 

src/dictionary/cover.rs

Lines changed: 7 additions & 11 deletions

@@ -2,10 +2,10 @@
 //! described in the paper "Effective Construction of Relative Lempel-Ziv Dictionaries",
 //! by Liao, Petri, Moffat, and Wirth, published under the University of Melbourne.
 //!
-//! See: https://people.eng.unimelb.edu.au/ammoffat/abstracts/lpmw16www.pdf
+//! See: <https://people.eng.unimelb.edu.au/ammoffat/abstracts/lpmw16www.pdf>
 //!
 //! Facebook's implementation was also used as a reference.
-//! https://github.com/facebook/zstd/tree/dev/lib/dictBuilder
+//! <https://github.com/facebook/zstd/tree/dev/lib/dictBuilder>
 
 use super::DictParams;
 use crate::dictionary::frequency::estimate_frequency;
@@ -43,11 +43,7 @@ impl PartialEq for Segment {
 
 impl PartialOrd for Segment {
     fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
-        match self.score.partial_cmp(&other.score) {
-            Some(core::cmp::Ordering::Equal) => {}
-            ord => return ord,
-        }
-        self.score.partial_cmp(&other.score)
+        Some(self.cmp(other))
     }
 }
 
@@ -68,15 +64,15 @@ pub struct Context {
 
 /// Returns the highest scoring segment in an epoch
 /// as a slice of that epoch.
-pub fn pick_best_segment<'epoch>(
+pub fn pick_best_segment(
     params: &DictParams,
     ctx: &mut Context,
-    collection_sample: &'epoch [u8],
+    collection_sample: &'_ [u8],
 ) -> Segment {
     let mut segments = collection_sample
         .chunks(params.segment_size as usize)
         .peekable();
-    let mut best_segment: &[u8] = &segments.peek().expect("at least one segment");
+    let mut best_segment: &[u8] = segments.peek().expect("at least one segment");
     let mut top_segment_score: usize = 0;
     // Iterate over segments and score each segment, keeping track of the best segment
     for segment in segments {
@@ -107,7 +103,7 @@ fn score_segment(ctx: &mut Context, collection_sample: &[u8], segment: &[u8]) ->
         if ctx.frequencies.contains_key(kmer) {
             continue;
         }
-        let kmer_score = estimate_frequency(kmer, &collection_sample);
+        let kmer_score = estimate_frequency(kmer, collection_sample);
         ctx.frequencies.insert(*kmer, kmer_score);
         segment_score += kmer_score;
     }
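The `partial_cmp` change above swaps a redundant double comparison for the canonical pattern of delegating `PartialOrd` to `Ord`, which guarantees the two orderings can never disagree. A standalone sketch of the pattern, not the crate's actual `Segment` type:

```rust
use core::cmp::Ordering;

#[derive(PartialEq, Eq)]
struct Scored {
    score: usize,
}

impl Ord for Scored {
    // The single source of truth for ordering.
    fn cmp(&self, other: &Self) -> Ordering {
        self.score.cmp(&other.score)
    }
}

impl PartialOrd for Scored {
    // Defer to Ord; this is the shape Clippy recommends for types
    // with a total order, so the two impls stay consistent.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
```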

src/dictionary/mod.rs

Lines changed: 3 additions & 3 deletions

@@ -50,8 +50,8 @@ pub struct DictParams {
     /// As found under "4. Experiments - Varying Segment Size" in the original paper, a
     /// segment size of 2 kiB was effective.
     ///
-    /// "We explored a range of [segment_size] values and found the performance of LMC is insensitive
-    /// to [segment_size]. We fix [segment_size] to 2kiB
+    /// "We explored a range of \[`segment_size`\] values and found the performance of LMC is insensitive
+    /// to \[`segment_size`\]. We fix \[`segment_size`\] to 2kiB"
     ///
     /// Reasonable range: [16, 2048+]
     pub segment_size: u32,
@@ -85,7 +85,7 @@ pub fn create_raw_dict_from_dir<P: AsRef<Path>, W: io::Write>(
     for entry in dir {
         let entry = entry?;
         if entry.file_type()?.is_dir() {
-            recurse_read(fs::read_dir(&entry.path())?, file_paths)?;
+            recurse_read(fs::read_dir(entry.path())?, file_paths)?;
         } else {
             file_paths.push(entry.path());
         }
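For context, the `recurse_read` helper touched above is a plain recursive directory walk over `std::fs`. Reconstructed here as a self-contained sketch; the signature is inferred, since only the loop body appears in the diff:

```rust
use std::fs::{self, ReadDir};
use std::io;
use std::path::PathBuf;

// Recursively collect every regular-file path under a directory tree.
fn recurse_read(dir: ReadDir, file_paths: &mut Vec<PathBuf>) -> io::Result<()> {
    for entry in dir {
        let entry = entry?;
        if entry.file_type()?.is_dir() {
            // Descend into subdirectories, propagating IO errors.
            recurse_read(fs::read_dir(entry.path())?, file_paths)?;
        } else {
            file_paths.push(entry.path());
        }
    }
    Ok(())
}
```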

src/dictionary/reservoir.rs

Lines changed: 2 additions & 3 deletions

@@ -2,7 +2,7 @@ use super::cover::K;
 use alloc::vec::Vec;
 use core::f64::consts::E;
 use fastrand;
-use std::io;
+use std::{io, vec};
 
 /// Creates a representative sample of `input` of `size` bytes.
 pub fn create_sample<R: io::Read>(input: &mut R, size: usize) -> Vec<u8> {
@@ -31,8 +31,7 @@ impl Reservoir {
     /// Initialize a new empty reservoir, creating an allocation of `size`.
     pub fn new(size: usize) -> Self {
         assert!(size >= 16, "Reservoirs cannot be below 16 bytes in size");
-        let mut lake = Vec::with_capacity(size);
-        lake.resize(size, 0);
+        let lake: Vec<u8> = vec![0; size];
         let k = K as u16;
         Self { lake, k }
     }
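`create_sample` builds a fixed-size representative sample from a stream, which is the job reservoir sampling solves. For orientation, a sketch of the classic byte-level Algorithm R using the same `fastrand` dependency the module already imports; this is context only, not the crate's actual sampling strategy:

```rust
use std::io::{self, Read};

// Classic reservoir sampling (Algorithm R): every input byte ends up
// in the sample with equal probability, using O(size) memory.
fn sample_bytes<R: Read>(input: &mut R, size: usize) -> io::Result<Vec<u8>> {
    let mut reservoir = vec![0u8; size];
    let mut seen = 0usize;
    let mut byte = [0u8; 1];
    while input.read(&mut byte)? == 1 {
        if seen < size {
            // Fill phase: keep the first `size` bytes unconditionally.
            reservoir[seen] = byte[0];
        } else {
            // Replacement phase: keep this byte with probability size / (seen + 1).
            let j = fastrand::usize(..=seen);
            if j < size {
                reservoir[j] = byte[0];
            }
        }
        seen += 1;
    }
    // If the input was shorter than the reservoir, shrink to what we saw.
    reservoir.truncate(seen.min(size));
    Ok(reservoir)
}
```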

src/lib.rs

Lines changed: 1 addition & 0 deletions

@@ -35,6 +35,7 @@ macro_rules! vprintln {
 mod bit_io;
 mod common;
 pub mod decoding;
+#[cfg(feature = "dict_builder")]
 pub mod dictionary;
 pub mod encoding;
 
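With this gate in place, `ruzstd::dictionary` is only compiled when `dict_builder` is enabled. A downstream crate that must build both ways can declare a passthrough feature of its own and mirror the gate; a minimal sketch, with the passthrough feature name being an assumption:

```rust
// Sketch of downstream use. Assumes the consumer's own Cargo.toml
// declares a passthrough feature (name assumed):
//   [features]
//   dict_builder = ["ruzstd/dict_builder"]
// Re-export the builders only when the gated module actually exists.
#[cfg(feature = "dict_builder")]
pub use ruzstd::dictionary::{create_raw_dict_from_dir, create_raw_dict_from_source};
```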
