@@ -57,6 +57,52 @@ let next_state = index.next_state(&initial_state, token_id);
57
57
let final_states = index . final_states ();
58
58
```
59
59
60
+ ### Vocabulary
61
+
62
+ You can create a `Vocabulary` in three ways:
63
+
64
+ 1. **`Vocabulary::from_pretrained(model, parameters)`** - Loads the vocabulary from a pretrained model (as in the example above).
65
+
66
+ 2. **Manual creation** - You can create a vocabulary from token mappings:
67
+
68
+ 1. **`Vocabulary::new(eos_token_id)`** - Creates an empty vocabulary, to which you then add tokens with `try_insert()`:
69
+
70
+ ``` rust
71
+ let mut vocabulary = Vocabulary :: new (50256 );
72
+ vocabulary . try_insert (" hello" , 0 )? ;
73
+ vocabulary . try_insert (vec! [32 ], 1 )? ;
74
+ ```
75
+
76
+ 2. **`Vocabulary::try_from((eos_token_id, tokens))`** - Creates a vocabulary by directly providing the token mappings.
77
+
78
+ - It can be done either with the tokens as strings:
79
+
80
+ ```rust
81
+ use rustc_hash :: FxHashMap as HashMap ;
82
+
83
+ let eos_token_id : u32 = 50256 ;
84
+ let mut tokens : HashMap <String , Vec <u32 >> = HashMap :: default ();
85
+ tokens . insert (" hello" . to_string (), vec! [0 ]);
86
+ tokens . insert (" world" . to_string (), vec! [1 ]);
87
+
88
+ let vocabulary = Vocabulary :: try_from ((eos_token_id , tokens ))? ;
89
+ ```
90
+
91
+ - Or with the tokens as byte-vector keys:
92
+
93
+ ```rust
94
+ use rustc_hash :: FxHashMap as HashMap ;
95
+
96
+ let eos_token_id : u32 = 50256 ;
97
+ let mut tokens : HashMap <Vec <u8 >, Vec <u32 >> = HashMap :: default ();
98
+ tokens . insert (b " hello" . to_vec (), vec! [0 ]);
99
+ tokens . insert (b " world" . to_vec (), vec! [1 ]);
100
+
101
+ let vocabulary = Vocabulary :: try_from ((eos_token_id , tokens ))? ;
102
+ ```
103
+
104
+ **Important**: When creating a `Vocabulary` manually from tokenizer data, ensure tokens are converted to their string representations, replacing any special tokens that wouldn't be recognized by the DFA.
105
+
60
106
## Python Bindings
61
107
62
108
Additionally, the project provides interfaces to integrate the crate's functionality with Python.
0 commit comments