251 lines
7 KiB
Markdown
251 lines
7 KiB
Markdown
# marisa-rs
|
|
|
|
Safe Rust wrapper for the [marisa-trie](https://github.com/s-yata/marisa-trie) C++ library.
|
|
|
|
marisa-trie is a static and space-efficient trie data structure library. This crate provides safe Rust bindings to the C++ library.
|
|
|
|
## Installation
|
|
|
|
Add this to your `Cargo.toml`:
|
|
|
|
```toml
|
|
[dependencies]
|
|
marisa-rs = "0.1"
|
|
```
|
|
|
|
## Quick Start
|
|
|
|
```rust
|
|
use marisa_rs::{Keyset, Trie};
|
|
|
|
fn main() {
|
|
// Create a keyset and add words
|
|
let mut keyset = Keyset::new();
|
|
keyset.push("apple");
|
|
keyset.push("application");
|
|
keyset.push("apply");
|
|
|
|
// Build the trie
|
|
let mut trie = Trie::new();
|
|
trie.build(&mut keyset).unwrap();
|
|
|
|
// Lookup a word
|
|
if let Some(id) = trie.lookup("apple") {
|
|
println!("Found 'apple' with ID: {}", id);
|
|
}
|
|
|
|
// Search for words starting with "app"
|
|
trie.predictive_search("app", |word, id| {
|
|
println!("Found: {} (ID: {})", word, id);
|
|
});
|
|
}
|
|
```
|
|
|
|
## Basic Usage
|
|
|
|
### Creating and Building a Trie
|
|
|
|
```rust
|
|
use marisa_rs::{Keyset, Trie};
|
|
|
|
// Create a keyset
|
|
let mut keyset = Keyset::new();
|
|
|
|
// Add words to the keyset
|
|
keyset.push("cat");
|
|
keyset.push("car");
|
|
keyset.push("card");
|
|
keyset.push("care");
|
|
|
|
// Build the trie
|
|
let mut trie = Trie::new();
|
|
trie.build(&mut keyset)?;
|
|
```
|
|
|
|
### Saving and Loading Tries
|
|
|
|
```rust
|
|
use marisa_rs::{Keyset, Trie};
|
|
|
|
// Build a trie
|
|
let mut keyset = Keyset::new();
|
|
keyset.push("hello");
|
|
keyset.push("world");
|
|
|
|
let mut trie = Trie::new();
|
|
trie.build(&mut keyset)?;
|
|
|
|
// Save the trie to a file
|
|
trie.save("my_trie.marisa")?;
|
|
|
|
// Load the trie from a file
|
|
let mut loaded_trie = Trie::new();
|
|
loaded_trie.load("my_trie.marisa")?;
|
|
|
|
// Or use memory mapping for better performance with large tries
|
|
let mut mmapped_trie = Trie::new();
|
|
mmapped_trie.mmap("my_trie.marisa")?;
|
|
|
|
// Check the serialized size of the trie in bytes
|
|
println!("Trie size: {} bytes", trie.io_size());
|
|
```
|
|
|
|
### RecordTrie Usage
|
|
|
|
RecordTrie allows storing multiple structured records for each key:
|
|
|
|
```rust
|
|
use marisa_rs::RecordTrie;
|
|
|
|
// Create a builder
|
|
let mut builder = RecordTrie::builder();
|
|
|
|
// Add structured data (supports duplicate keys)
|
|
builder.insert_u32_pair("apple", (1, 100)); // price, quantity
|
|
builder.insert_u32_pair("apple", (2, 50)); // different record for same key
|
|
builder.insert_u32_pair("banana", (3, 200));
|
|
|
|
// Add vector data
|
|
builder.insert_u32_vec("features", vec![1, 2, 3, 4]);
|
|
builder.insert_u32_vec("features", vec![5, 6]); // different length
|
|
|
|
// Add raw binary data
|
|
builder.insert("description", b"Fresh fruit".to_vec());
|
|
|
|
// Build the RecordTrie
|
|
let record_trie = builder.build().unwrap();
|
|
|
|
// Retrieve all records for a key
|
|
let apple_records = record_trie.get_u32_pairs("apple");
|
|
println!("Apple records: {:?}", apple_records); // [(1, 100), (2, 50)]
|
|
|
|
let feature_vecs = record_trie.get_u32_vecs("features");
|
|
println!("Feature vectors: {:?}", feature_vecs); // [[1, 2, 3, 4], [5, 6]]
|
|
|
|
// Prefix search works too
|
|
let fruit_keys = record_trie.keys_with_prefix("a");
|
|
println!("Keys starting with 'a': {:?}", fruit_keys); // ["apple"]
|
|
|
|
// Save and load
|
|
record_trie.save("trie.marisa", "records.json").unwrap();
|
|
let loaded_trie = RecordTrie::load("trie.marisa", "records.json").unwrap();
|
|
```
|
|
|
|
### Lookup Operations
|
|
|
|
```rust
|
|
// Exact lookup
|
|
match trie.lookup("car") {
|
|
Some(id) => println!("Found with ID: {}", id),
|
|
None => println!("Not found"),
|
|
}
|
|
|
|
// Reverse lookup (get word by ID)
|
|
match trie.reverse_lookup(0) {
|
|
Ok(word) => println!("ID 0 corresponds to: {}", word),
|
|
Err(_) => println!("Invalid ID"),
|
|
}
|
|
```
|
|
|
|
### Search Operations
|
|
|
|
```rust
|
|
// Find all prefixes of a word
|
|
trie.common_prefix_search("cards", |word, id| {
|
|
println!("Prefix: {} (ID: {})", word, id);
|
|
// Output: "car", "card"
|
|
});
|
|
|
|
// Find all words starting with a prefix
|
|
trie.predictive_search("car", |word, id| {
|
|
println!("Word: {} (ID: {})", word, id);
|
|
// Output: "car", "card", "care"
|
|
});
|
|
```
|
|
|
|
### Working with Weights
|
|
|
|
```rust
|
|
let mut keyset = Keyset::new();
|
|
|
|
// Add words with custom weights
|
|
keyset.push_back("important", 10.0);
|
|
keyset.push_back("normal", 1.0);
|
|
keyset.push_back("less_important", 0.1);
|
|
|
|
let mut trie = Trie::new();
|
|
trie.build(&mut keyset)?;
|
|
```
|
|
|
|
## API Reference
|
|
|
|
### Keyset
|
|
|
|
- `Keyset::new()` - Create a new empty keyset
|
|
- `keyset.push(key)` - Add a key with default weight (1.0)
|
|
- `keyset.push_back(key, weight)` - Add a key with specified weight
|
|
- `keyset.size()` - Get the number of keys
|
|
- `keyset.is_empty()` - Check if the keyset is empty
|
|
|
|
### Trie
|
|
|
|
- `Trie::new()` - Create a new empty trie
|
|
- `trie.build(&mut keyset)` - Build the trie from a keyset
|
|
- `trie.lookup(key)` - Find the ID of a key (returns `Option<usize>`)
|
|
- `trie.reverse_lookup(id)` - Find the key for an ID (returns `Result<String, &str>`)
|
|
- `trie.common_prefix_search(query, callback)` - Find all keys that are prefixes of query
|
|
- `trie.predictive_search(query, callback)` - Find all keys that start with query
|
|
- `trie.size()` - Get the number of keys in the trie
|
|
- `trie.is_empty()` - Check if the trie is empty
|
|
- `trie.save(path)` - Save the trie to a file (returns `Result<(), &str>`)
|
|
- `trie.load(path)` - Load a trie from a file (returns `Result<(), &str>`)
|
|
- `trie.mmap(path)` - Memory-map a trie file for efficient read-only access (returns `Result<(), &str>`)
|
|
- `trie.io_size()` - Get the serialized size of the trie in bytes
|
|
- `trie.clear()` - Clear the trie, removing all keys (returns `Result<(), &str>`)
|
|
|
|
### RecordTrie
|
|
|
|
RecordTrie allows storing structured data associated with keys, similar to Python's marisa-trie RecordTrie:
|
|
|
|
- `RecordTrie::builder()` - Create a new RecordTrie builder
|
|
- `builder.insert(key, data)` - Insert raw binary data for a key
|
|
- `builder.insert_u32_pair(key, (a, b))` - Insert a pair of u32 values
|
|
- `builder.insert_u32_vec(key, vec![...])` - Insert a vector of u32 values
|
|
- `builder.build()` - Build the RecordTrie
|
|
- `trie.get(key)` - Get all raw binary records for a key
|
|
- `trie.get_u32_pairs(key)` - Get all u32 pairs for a key
|
|
- `trie.get_u32_vecs(key)` - Get all u32 vectors for a key
|
|
- `trie.contains_key(key)` - Check if a key exists
|
|
- `trie.keys_with_prefix(prefix)` - Find all keys with given prefix
|
|
- `trie.prefixes_of(query)` - Find all keys that are prefixes of query
|
|
- `trie.save(trie_path, records_path)` - Save to files
|
|
- `RecordTrie::load(trie_path, records_path)` - Load from files
|
|
|
|
## Japanese Text Example
|
|
|
|
```rust
|
|
use marisa_rs::{Keyset, Trie};
|
|
|
|
let mut keyset = Keyset::new();
|
|
keyset.push("あ"); // a
|
|
keyset.push("あい"); // ai (love)
|
|
keyset.push("あいて"); // aite (partner)
|
|
|
|
let mut trie = Trie::new();
|
|
trie.build(&mut keyset).unwrap();
|
|
|
|
// Works with UTF-8 strings
|
|
if let Some(id) = trie.lookup("あい") {
|
|
println!("Found Japanese word with ID: {}", id);
|
|
}
|
|
```
|
|
|
|
## Thread Safety
|
|
|
|
All types (`Keyset`, `Trie`, `Agent`) implement `Send` and can be transferred between threads. However, they are not `Sync`, so a single instance cannot be accessed from multiple threads at once without additional synchronization (for example, by wrapping it in a `Mutex` or `Arc<Mutex<_>>`).
|
|
|
|
## License
|
|
|
|
This project is licensed under the LGPL, Version 2.0.
|
|
|
|
This crate is built on top of the excellent [marisa-trie](https://github.com/s-yata/marisa-trie) library by Susumu Yata.
|