This commit is contained in:
Soma Nakamura 2025-07-24 02:24:10 +09:00
commit c8b3c3c44f
11 changed files with 1441 additions and 0 deletions

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
/target/
Cargo.lock

457
Cargo.lock generated Normal file
View file

@ -0,0 +1,457 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "bindgen"
version = "0.69.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
dependencies = [
"bitflags",
"cexpr",
"clang-sys",
"itertools",
"lazy_static",
"lazycell",
"log",
"prettyplease",
"proc-macro2",
"quote",
"regex",
"rustc-hash",
"shlex",
"syn",
"which",
]
[[package]]
name = "bitflags"
version = "2.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967"
[[package]]
name = "cc"
version = "1.2.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "deec109607ca693028562ed836a5f1c4b8bd77755c4e132fc5ce11b0b6211ae7"
dependencies = [
"shlex",
]
[[package]]
name = "cexpr"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
dependencies = [
"nom",
]
[[package]]
name = "cfg-if"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268"
[[package]]
name = "clang-sys"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
dependencies = [
"glob",
"libc",
"libloading",
]
[[package]]
name = "either"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
[[package]]
name = "errno"
version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad"
dependencies = [
"libc",
"windows-sys 0.60.2",
]
[[package]]
name = "glob"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
[[package]]
name = "home"
version = "0.5.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf"
dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "itertools"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
dependencies = [
"either",
]
[[package]]
name = "lazy_static"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "lazycell"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "libc"
version = "0.2.174"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776"
[[package]]
name = "libloading"
version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
dependencies = [
"cfg-if",
"windows-targets 0.53.2",
]
[[package]]
name = "linux-raw-sys"
version = "0.4.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
[[package]]
name = "log"
version = "0.4.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
name = "marisa-rs"
version = "0.1.0"
dependencies = [
"bindgen",
"cc",
"libc",
"pkg-config",
]
[[package]]
name = "memchr"
version = "2.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
[[package]]
name = "minimal-lexical"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
[[package]]
name = "nom"
version = "7.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
dependencies = [
"memchr",
"minimal-lexical",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "pkg-config"
version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "prettyplease"
version = "0.2.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "061c1221631e079b26479d25bbf2275bfe5917ae8419cd7e34f13bfc2aa7539a"
dependencies = [
"proc-macro2",
"syn",
]
[[package]]
name = "proc-macro2"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]]
name = "regex"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "rustc-hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustix"
version = "0.38.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154"
dependencies = [
"bitflags",
"errno",
"libc",
"linux-raw-sys",
"windows-sys 0.59.0",
]
[[package]]
name = "shlex"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "syn"
version = "2.0.104"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]]
name = "which"
version = "4.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
dependencies = [
"either",
"home",
"once_cell",
"rustix",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-sys"
version = "0.60.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
dependencies = [
"windows-targets 0.53.2",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm 0.52.6",
"windows_aarch64_msvc 0.52.6",
"windows_i686_gnu 0.52.6",
"windows_i686_gnullvm 0.52.6",
"windows_i686_msvc 0.52.6",
"windows_x86_64_gnu 0.52.6",
"windows_x86_64_gnullvm 0.52.6",
"windows_x86_64_msvc 0.52.6",
]
[[package]]
name = "windows-targets"
version = "0.53.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef"
dependencies = [
"windows_aarch64_gnullvm 0.53.0",
"windows_aarch64_msvc 0.53.0",
"windows_i686_gnu 0.53.0",
"windows_i686_gnullvm 0.53.0",
"windows_i686_msvc 0.53.0",
"windows_x86_64_gnu 0.53.0",
"windows_x86_64_gnullvm 0.53.0",
"windows_x86_64_msvc 0.53.0",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_aarch64_msvc"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnu"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_gnullvm"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_i686_msvc"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnu"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "windows_x86_64_msvc"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486"

22
Cargo.toml Normal file
View file

@ -0,0 +1,22 @@
[package]
name = "marisa-rs"
version = "0.1.0"
edition = "2021"
description = "Safe Rust wrapper for the marisa-trie C++ library - a static memory-efficient trie data structure. Requires marisa-trie system library."
license = "MIT OR Apache-2.0"
readme = "README.md"
homepage = "https://crates.io/crates/marisa-rs"
keywords = ["trie", "string", "search", "marisa"]
categories = ["data-structures", "text-processing"]
[lib]
name = "marisa_rs"
crate-type = ["cdylib", "rlib"]
[dependencies]
libc = "0.2"
[build-dependencies]
cc = "1.0"
pkg-config = "0.3"
bindgen = "0.69"

178
README.md Normal file
View file

@ -0,0 +1,178 @@
# marisa-rs
Safe Rust wrapper for the [marisa-trie](https://github.com/s-yata/marisa-trie) C++ library.
marisa-trie is a static and space-efficient trie data structure library. This crate provides safe Rust bindings to the C++ library.
## Installation
Add this to your `Cargo.toml`:
```toml
[dependencies]
marisa-rs = "0.1"
```
## Requirements
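This crate requires the marisa-trie C++ library (headers and library files) to be installed on your system. Building it also needs a C++ compiler and libclang, because the build script compiles a small C++ shim and generates the FFI bindings with bindgen; pkg-config is used to locate marisa-trie when it is available.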
### Ubuntu/Debian
```bash
sudo apt-get install libmarisa-dev
```
### macOS
```bash
brew install marisa-trie
```
## Quick Start
```rust
use marisa_rs::{Keyset, Trie};
fn main() {
// Create a keyset and add words
let mut keyset = Keyset::new();
keyset.push("apple");
keyset.push("application");
keyset.push("apply");
// Build the trie
let mut trie = Trie::new();
trie.build(&mut keyset).unwrap();
// Lookup a word
if let Some(id) = trie.lookup("apple") {
println!("Found 'apple' with ID: {}", id);
}
// Search for words starting with "app"
trie.predictive_search("app", |word, id| {
println!("Found: {} (ID: {})", word, id);
});
}
```
## Basic Usage
### Creating and Building a Trie
```rust
use marisa_rs::{Keyset, Trie};
// Create a keyset
let mut keyset = Keyset::new();
// Add words to the keyset
keyset.push("cat");
keyset.push("car");
keyset.push("card");
keyset.push("care");
// Build the trie
let mut trie = Trie::new();
trie.build(&mut keyset)?;
```
### Lookup Operations
```rust
// Exact lookup
match trie.lookup("car") {
Some(id) => println!("Found with ID: {}", id),
None => println!("Not found"),
}
// Reverse lookup (get word by ID)
match trie.reverse_lookup(0) {
Ok(word) => println!("ID 0 corresponds to: {}", word),
Err(_) => println!("Invalid ID"),
}
```
### Search Operations
```rust
// Find all prefixes of a word
trie.common_prefix_search("cards", |word, id| {
println!("Prefix: {} (ID: {})", word, id);
// Output: "car", "card"
});
// Find all words starting with a prefix
trie.predictive_search("car", |word, id| {
println!("Word: {} (ID: {})", word, id);
// Output: "car", "card", "care"
});
```
### Working with Weights
```rust
let mut keyset = Keyset::new();
// Add words with custom weights
keyset.push_back("important", 10.0);
keyset.push_back("normal", 1.0);
keyset.push_back("less_important", 0.1);
let mut trie = Trie::new();
trie.build(&mut keyset)?;
```
## API Reference
### Keyset
- `Keyset::new()` - Create a new empty keyset
- `keyset.push(key)` - Add a key with default weight (1.0)
- `keyset.push_back(key, weight)` - Add a key with specified weight
- `keyset.size()` - Get the number of keys
- `keyset.is_empty()` - Check if the keyset is empty
### Trie
- `Trie::new()` - Create a new empty trie
- `trie.build(&mut keyset)` - Build the trie from a keyset
- `trie.lookup(key)` - Find the ID of a key (returns `Option<usize>`)
- `trie.reverse_lookup(id)` - Find the key for an ID (returns `Result<String, &'static str>`)
- `trie.common_prefix_search(query, callback)` - Find all keys that are prefixes of query
- `trie.predictive_search(query, callback)` - Find all keys that start with query
- `trie.size()` - Get the number of keys in the trie
- `trie.is_empty()` - Check if the trie is empty
## Japanese Text Example
```rust
use marisa_rs::{Keyset, Trie};
let mut keyset = Keyset::new();
keyset.push("あ"); // a
keyset.push("あい"); // ai (love)
keyset.push("あいて"); // aite (partner)
let mut trie = Trie::new();
trie.build(&mut keyset).unwrap();
// Works with UTF-8 strings
if let Some(id) = trie.lookup("あい") {
println!("Found Japanese word with ID: {}", id);
}
```
## Thread Safety
All types (`Keyset`, `Trie`, `Agent`) implement `Send` and can be transferred between threads. However, they are not `Sync` and cannot be shared between threads without additional synchronization.
## License
This project is licensed under either of
* Apache License, Version 2.0
* MIT license
at your option.
This crate is built on top of the excellent [marisa-trie](https://github.com/s-yata/marisa-trie) library by Susumu Yata.

54
build.rs Normal file
View file

@ -0,0 +1,54 @@
use std::env;
use std::path::PathBuf;
/// Build script: compiles the C++ shim (`wrapper.cpp`), links it against
/// marisa-trie and generates Rust FFI bindings from `wrapper.h`.
fn main() {
    println!("cargo:rerun-if-changed=wrapper.cpp");
    println!("cargo:rerun-if-changed=wrapper.h");
    // CPATH contributes include directories below, so changes must re-run us.
    println!("cargo:rerun-if-env-changed=CPATH");

    let out_path = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set by Cargo"));

    // Probe without letting pkg-config print link directives immediately:
    // they are emitted after the shim is compiled so that the shim archive
    // precedes the library it depends on in the link order.
    let marisa = pkg_config::Config::new()
        .atleast_version("0.2.0")
        .cargo_metadata(false)
        .probe("marisa")
        .ok();

    let mut build = cc::Build::new();
    build
        .cpp(true)
        .file("wrapper.cpp")
        .flag_if_supported("-std=c++17");

    // Header locations reported by pkg-config (previously discarded).
    if let Some(library) = &marisa {
        for dir in &library.include_paths {
            build.include(dir);
        }
    }

    // Honour CPATH using the platform's path-list separator.
    if let Some(cpath) = env::var_os("CPATH") {
        for dir in env::split_paths(&cpath) {
            if !dir.as_os_str().is_empty() {
                build.include(dir);
            }
        }
    }

    build
        .include("/usr/local/include")
        .include("/usr/include");
    build.compile("marisa_wrapper");

    match &marisa {
        // Use exactly what pkg-config reported for this installation.
        Some(library) => {
            for dir in &library.link_paths {
                println!("cargo:rustc-link-search=native={}", dir.display());
            }
            for name in &library.libs {
                println!("cargo:rustc-link-lib={}", name);
            }
        }
        // No pkg-config data: fall back to the conventional install prefixes.
        None => {
            println!("cargo:rustc-link-lib=static=marisa");
            println!("cargo:rustc-link-search=native=/usr/local/lib");
            println!("cargo:rustc-link-search=native=/usr/lib");
        }
    }

    let bindings = bindgen::Builder::default()
        .header("wrapper.h")
        .parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
        .rust_target(bindgen::RustTarget::Stable_1_47)
        .generate_inline_functions(false)
        .generate_comments(false)
        .generate()
        .expect("Unable to generate bindings");

    bindings
        .write_to_file(out_path.join("bindings.rs"))
        .expect("Couldn't write bindings!");
}

96
flake.lock generated Normal file
View file

@ -0,0 +1,96 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1731533236,
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1752950548,
"narHash": "sha256-NS6BLD0lxOrnCiEOcvQCDVPXafX1/ek1dfJHX1nUIzc=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "c87b95e25065c028d31a94f06a62927d18763fdf",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs_2": {
"locked": {
"lastModified": 1744536153,
"narHash": "sha256-awS2zRgF4uTwrOKwwiJcByDzDOdo3Q1rPZbiHQg/N38=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "18dd725c29603f582cf1900e0d25f9f1063dbf11",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixpkgs-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs",
"rust-overlay": "rust-overlay"
}
},
"rust-overlay": {
"inputs": {
"nixpkgs": "nixpkgs_2"
},
"locked": {
"lastModified": 1753238793,
"narHash": "sha256-jmQeEpgX+++MEgrcikcwoSiI7vDZWLP0gci7XiWb9uQ=",
"owner": "oxalica",
"repo": "rust-overlay",
"rev": "0ad7ab4ca8e83febf147197e65c006dff60623ab",
"type": "github"
},
"original": {
"owner": "oxalica",
"repo": "rust-overlay",
"type": "github"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

85
flake.nix Normal file
View file

@ -0,0 +1,85 @@
{
  description = "Rust wrapper for marisa-trie C++ library";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
    rust-overlay.url = "github:oxalica/rust-overlay";
    flake-utils.url = "github:numtide/flake-utils";
  };

  outputs = { self, nixpkgs, rust-overlay, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        overlays = [ (import rust-overlay) ];
        pkgs = import nixpkgs {
          inherit system overlays;
        };

        # Latest stable toolchain plus the components used during development.
        rustToolchain = pkgs.rust-bin.stable.latest.default.override {
          extensions = [ "rust-src" "clippy" "rustfmt" ];
        };

        # Use nixpkgs marisa-trie package
        marisa-trie = pkgs.marisa;
      in
      {
        # Interactive shell: toolchain, native library and debugging tools.
        devShells.default = pkgs.mkShell {
          buildInputs = with pkgs; [
            rustToolchain
            marisa-trie

            # Build tools
            cmake
            gcc
            pkg-config
            clang
            llvmPackages.libclang.lib

            # Development tools
            rust-analyzer
            clippy
            rustfmt

            # C++ development
            gdb
            valgrind
          ];

          # Expose marisa-trie to pkg-config, the linker and the C++ compiler,
          # and tell bindgen where libclang lives.
          shellHook = ''
            echo "Rust marisa-trie development environment"
            echo "Rust version: $(rustc --version)"
            echo "Cargo version: $(cargo --version)"
            echo "marisa-trie library available at: ${marisa-trie}"
            export PKG_CONFIG_PATH="${marisa-trie}/lib/pkgconfig:$PKG_CONFIG_PATH"
            export LD_LIBRARY_PATH="${marisa-trie}/lib:$LD_LIBRARY_PATH"
            export LIBRARY_PATH="${marisa-trie}/lib:$LIBRARY_PATH"
            export CPATH="${marisa-trie}/include:$CPATH"
            export LIBCLANG_PATH="${pkgs.llvmPackages.libclang.lib}/lib"
          '';

          RUST_SRC_PATH = "${rustToolchain}/lib/rustlib/src/rust/library";
        };

        packages.default = pkgs.rustPlatform.buildRustPackage {
          pname = "marisa-rs";
          version = "0.1.0";
          src = ./.;

          cargoLock = {
            lockFile = ./Cargo.lock;
          };

          buildInputs = [ marisa-trie ];

          # build.rs runs bindgen, which needs libclang; bindgenHook provides
          # LIBCLANG_PATH and the matching clang arguments inside the sandbox.
          nativeBuildInputs = with pkgs; [
            cmake
            gcc
            pkg-config
            rustPlatform.bindgenHook
          ];

          meta = with pkgs.lib; {
            description = "Rust wrapper for marisa-trie C++ library";
            license = with licenses; [ mit asl20 ];
            platforms = platforms.unix;
          };
        };
      });
}

333
src/lib.rs Normal file
View file

@ -0,0 +1,333 @@
//! # marisa-rs
//!
//! Safe Rust wrapper for the marisa-trie C++ library.
//!
//! marisa-trie is a static and space-efficient trie data structure library.
//! This crate provides safe Rust bindings to the C++ library.
//!
//! ## Example
//!
//! ```rust
//! use marisa_rs::{Keyset, Trie};
//!
//! let mut keyset = Keyset::new();
//! keyset.push("apple");
//! keyset.push("application");
//! keyset.push("apply");
//!
//! let mut trie = Trie::new();
//! trie.build(&mut keyset).unwrap();
//!
//! // Lookup
//! assert!(trie.lookup("apple").is_some());
//! assert!(trie.lookup("orange").is_none());
//!
//! // Common prefix search
//! trie.common_prefix_search("application", |key, id| {
//! println!("Found: {} (ID: {})", key, id);
//! });
//! ```
use std::slice;
/// Raw FFI declarations generated by bindgen from `wrapper.h` at build time.
///
/// The generated names follow C conventions and not every item is used by the
/// safe wrappers, so the corresponding lints are silenced for this module only.
#[allow(non_upper_case_globals, non_camel_case_types, non_snake_case, dead_code)]
mod bindings {
    include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
}
use bindings::*;
/// A keyset for building a trie.
///
/// A `Keyset` collects keys, each with a weight, before they are handed to
/// [`Trie::build`].
pub struct Keyset {
    /// Pointer to the C++ keyset created by `marisa_keyset_new`; released in `Drop`.
    inner: *mut MarisaKeyset,
}

impl Keyset {
    /// Creates a new empty keyset.
    pub fn new() -> Self {
        // SAFETY: the constructor takes no arguments and returns a fresh object
        // that this wrapper owns from now on.
        let inner = unsafe { marisa_keyset_new() };
        Keyset { inner }
    }

    /// Adds a key with the specified weight to the keyset.
    ///
    /// The key is passed as a pointer/length pair, so it does not need to be
    /// NUL-terminated.
    pub fn push_back(&mut self, key: &str, weight: f32) {
        let bytes = key.as_bytes();
        // SAFETY: `bytes` is valid for `bytes.len()` bytes for the duration of
        // the call and `self.inner` is the pointer obtained in `new`.
        unsafe {
            marisa_keyset_push_back(
                self.inner,
                // `c_char` is `i8` on some targets and `u8` on others; casting
                // to the alias keeps this portable.
                bytes.as_ptr().cast::<std::os::raw::c_char>(),
                bytes.len(),
                weight,
            );
        }
    }

    /// Adds a key with default weight (1.0) to the keyset.
    pub fn push(&mut self, key: &str) {
        self.push_back(key, 1.0);
    }

    /// Returns the number of keys in the keyset.
    pub fn size(&self) -> usize {
        // SAFETY: `self.inner` is the pointer obtained in `new`.
        unsafe { marisa_keyset_size(self.inner) }
    }

    /// Returns true if the keyset is empty.
    pub fn is_empty(&self) -> bool {
        self.size() == 0
    }
}

impl Default for Keyset {
    /// Equivalent to [`Keyset::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl Drop for Keyset {
    /// Releases the underlying C++ keyset.
    fn drop(&mut self) {
        // SAFETY: `self.inner` came from `marisa_keyset_new` and is deleted
        // exactly once, here.
        unsafe {
            marisa_keyset_delete(self.inner);
        }
    }
}
/// A trie data structure for efficient string lookups.
///
/// The trie must be built from a [`Keyset`] before it can be used for lookups.
pub struct Trie {
    /// Pointer to the C++ trie created by `marisa_trie_new`; released in `Drop`.
    inner: *mut MarisaTrie,
}

impl Trie {
    /// Creates a new empty trie.
    pub fn new() -> Self {
        // SAFETY: the constructor takes no arguments and returns a fresh object
        // that this wrapper owns from now on.
        let inner = unsafe { marisa_trie_new() };
        Trie { inner }
    }

    /// Builds the trie from the given keyset.
    ///
    /// # Errors
    ///
    /// Returns an error if the trie cannot be built from the keyset.
    pub fn build(&mut self, keyset: &mut Keyset) -> Result<(), &'static str> {
        // SAFETY: both pointers come from their wrappers' constructors and are
        // kept alive by the borrows held for this call.
        let built = unsafe { marisa_trie_build(self.inner, keyset.inner) } == 1;
        if built {
            Ok(())
        } else {
            Err("Failed to build trie")
        }
    }

    /// Looks up a key in the trie and returns its ID if found.
    ///
    /// # Returns
    ///
    /// - `Some(id)` if the key is found in the trie
    /// - `None` if the key is not found
    pub fn lookup(&self, key: &str) -> Option<usize> {
        let mut agent = Agent::new();
        agent.set_query(key);
        // SAFETY: `key` outlives the call, so the query stored in the agent
        // stays valid while the trie reads it.
        let found = unsafe { marisa_trie_lookup(self.inner, agent.inner) } == 1;
        if found {
            Some(agent.key_id())
        } else {
            None
        }
    }

    /// Performs reverse lookup to get the key corresponding to the given ID.
    ///
    /// # Errors
    ///
    /// Returns an error if the ID is not valid.
    pub fn reverse_lookup(&self, id: usize) -> Result<String, &'static str> {
        let mut agent = Agent::new();
        agent.set_query_by_id(id);
        // SAFETY: both pointers come from their wrappers' constructors.
        let found = unsafe { marisa_trie_reverse_lookup(self.inner, agent.inner) } == 1;
        if found {
            Ok(agent.key_string())
        } else {
            Err("Failed to reverse lookup")
        }
    }

    /// Searches for all keys that are prefixes of the given query.
    ///
    /// The callback function is called for each matching key with the key and its ID.
    pub fn common_prefix_search<F>(&self, query: &str, mut callback: F)
    where
        F: FnMut(&str, usize),
    {
        let mut agent = Agent::new();
        agent.set_query(query);
        // Each successful call advances the agent to the next match.
        // SAFETY: `query` outlives the loop, so the agent's query stays valid.
        while unsafe { marisa_trie_common_prefix_search(self.inner, agent.inner) } == 1 {
            let key = agent.key_string();
            callback(&key, agent.key_id());
        }
    }

    /// Searches for all keys that have the given query as a prefix.
    ///
    /// The callback function is called for each matching key with the key and its ID.
    pub fn predictive_search<F>(&self, query: &str, mut callback: F)
    where
        F: FnMut(&str, usize),
    {
        let mut agent = Agent::new();
        agent.set_query(query);
        // Each successful call advances the agent to the next match.
        // SAFETY: `query` outlives the loop, so the agent's query stays valid.
        while unsafe { marisa_trie_predictive_search(self.inner, agent.inner) } == 1 {
            let key = agent.key_string();
            callback(&key, agent.key_id());
        }
    }

    /// Returns the number of keys stored in the trie.
    pub fn size(&self) -> usize {
        // SAFETY: `self.inner` is the pointer obtained in `new`.
        unsafe { marisa_trie_size(self.inner) }
    }

    /// Returns true if the trie is empty.
    pub fn is_empty(&self) -> bool {
        self.size() == 0
    }
}

impl Default for Trie {
    /// Equivalent to [`Trie::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl Drop for Trie {
    /// Releases the underlying C++ trie.
    fn drop(&mut self) {
        // SAFETY: `self.inner` came from `marisa_trie_new` and is deleted
        // exactly once, here.
        unsafe {
            marisa_trie_delete(self.inner);
        }
    }
}
/// An agent for performing trie operations.
///
/// An agent carries the query going into a trie operation and the key/ID that
/// comes out of it. It is used internally by [`Trie`] and should not be needed
/// directly in most cases.
pub struct Agent {
    /// Pointer to the C++ agent created by `marisa_agent_new`; released in `Drop`.
    inner: *mut MarisaAgent,
}

impl Agent {
    /// Creates a new agent with no query set.
    pub fn new() -> Self {
        // SAFETY: the constructor takes no arguments and returns a fresh object
        // that this wrapper owns from now on.
        let inner = unsafe { marisa_agent_new() };
        Agent { inner }
    }

    /// Sets the query string for the next search or lookup.
    ///
    /// NOTE(review): the bytes are passed by pointer and length; whether the
    /// C++ side copies them is not visible here, so keep `query` alive until
    /// the operation using this agent has finished.
    pub fn set_query(&mut self, query: &str) {
        let bytes = query.as_bytes();
        // SAFETY: `bytes` is valid for `bytes.len()` bytes during the call.
        unsafe {
            marisa_agent_set_query(
                self.inner,
                // `c_char` is `i8` on some targets and `u8` on others; casting
                // to the alias keeps this portable.
                bytes.as_ptr().cast::<std::os::raw::c_char>(),
                bytes.len(),
            );
        }
    }

    /// Sets a key ID as the query, for use with reverse lookup.
    pub fn set_query_by_id(&mut self, id: usize) {
        // SAFETY: `self.inner` is the pointer obtained in `new`.
        unsafe {
            marisa_agent_set_query_by_id(self.inner, id);
        }
    }

    /// Returns the agent's current key as an owned `String`.
    ///
    /// Invalid UTF-8 is replaced lossily. An agent that holds no key yields an
    /// empty string.
    pub fn key_string(&self) -> String {
        // SAFETY: `self.inner` is the pointer obtained in `new`; the returned
        // pointer/length pair is only dereferenced after the checks below.
        unsafe {
            let ptr = marisa_agent_key_ptr(self.inner);
            let len = marisa_agent_key_length(self.inner);
            // `slice::from_raw_parts` requires a non-null pointer even when the
            // length is zero, so handle the "no key" case explicitly.
            if ptr.is_null() || len == 0 {
                return String::new();
            }
            let bytes = slice::from_raw_parts(ptr.cast::<u8>(), len);
            String::from_utf8_lossy(bytes).into_owned()
        }
    }

    /// Returns the ID of the agent's current key.
    pub fn key_id(&self) -> usize {
        // SAFETY: `self.inner` is the pointer obtained in `new`.
        unsafe { marisa_agent_key_id(self.inner) }
    }
}

impl Default for Agent {
    /// Equivalent to [`Agent::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl Drop for Agent {
    /// Releases the underlying C++ agent.
    fn drop(&mut self) {
        // SAFETY: `self.inner` came from `marisa_agent_new` and is deleted
        // exactly once, here.
        unsafe {
            marisa_agent_delete(self.inner);
        }
    }
}
// Raw pointers are not `Send` by default, so the wrappers opt in explicitly.
// Each wrapper owns the C++ object it allocated in `new` and frees it in
// `Drop`, so moving a wrapper to another thread moves that ownership with it.
// NOTE(review): this assumes the underlying marisa objects have no thread
// affinity — confirm against the marisa-trie documentation.
// `Sync` is deliberately not implemented.
unsafe impl Send for Keyset {}
unsafe impl Send for Trie {}
unsafe impl Send for Agent {}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a trie from a small fixed vocabulary and exercises every query type.
    #[test]
    fn test_basic_operations() {
        let mut keyset = Keyset::new();
        for word in ["apple", "application", "apply", "apricot"] {
            keyset.push(word);
        }
        assert_eq!(keyset.size(), 4);
        assert!(!keyset.is_empty());

        let mut trie = Trie::new();
        trie.build(&mut keyset).expect("Failed to build trie");
        assert_eq!(trie.size(), 4);

        // Test lookup
        assert!(trie.lookup("apple").is_some());
        assert!(trie.lookup("banana").is_none());

        // Test reverse lookup: an ID must map back to the key it came from.
        let id = trie.lookup("apple").expect("'apple' should be present");
        assert_eq!(trie.reverse_lookup(id).unwrap(), "apple");

        // Test common prefix search: only "application" is a prefix of the query.
        let mut prefixes = Vec::new();
        trie.common_prefix_search("application", |key, id| {
            prefixes.push((key.to_string(), id));
        });
        assert_eq!(prefixes.len(), 1);
        assert_eq!(prefixes[0].0, "application");
        assert_eq!(Some(prefixes[0].1), trie.lookup("application"));

        // Test predictive search: every key starting with "app" (not "apricot").
        let mut completions = Vec::new();
        trie.predictive_search("app", |key, id| {
            completions.push((key.to_string(), id));
        });
        let mut keys: Vec<&str> = completions.iter().map(|(key, _)| key.as_str()).collect();
        keys.sort();
        assert_eq!(keys, ["apple", "application", "apply"]);
        for (key, id) in &completions {
            assert_eq!(trie.lookup(key), Some(*id));
        }
    }

    /// A freshly created keyset contains no keys.
    #[test]
    fn test_empty_keyset() {
        let keyset = Keyset::new();
        assert!(keyset.is_empty());
        assert_eq!(keyset.size(), 0);
    }

    /// A trie that has not been built reports itself as empty.
    #[test]
    fn test_empty_trie() {
        let trie = Trie::new();
        assert!(trie.is_empty());
        assert_eq!(trie.size(), 0);
    }
}

50
src/main.rs Normal file
View file

@ -0,0 +1,50 @@
use marisa_rs::{Keyset, Trie};
/// Command-line demo: builds a small trie and prints the result of each
/// supported query type.
fn main() {
    println!("Marisa Trie Demo");
    println!("================");

    let words = ["apple", "application", "apply", "apricot"];

    // Collect the demo vocabulary into a keyset, echoing each word.
    let mut keyset = Keyset::new();
    println!("Adding words to keyset:");
    for word in words {
        keyset.push(word);
        println!(" - {}", word);
    }

    // Build the trie; nothing below makes sense if this fails.
    let mut trie = Trie::new();
    if let Err(e) = trie.build(&mut keyset) {
        println!("Failed to build trie: {}", e);
        return;
    }
    println!("\nTrie built successfully! Size: {}", trie.size());

    println!("\n--- Lookup Test ---");
    for word in words {
        if let Some(id) = trie.lookup(word) {
            println!("'{}' found with ID: {}", word, id);
        } else {
            println!("'{}' not found", word);
        }
    }

    println!("\n--- Reverse Lookup Test ---");
    let key_count = trie.size();
    for i in 0..key_count {
        match trie.reverse_lookup(i) {
            Err(e) => println!("ID {}: {}", i, e),
            Ok(word) => println!("ID {} -> '{}'", i, word),
        }
    }

    // Both searches report their matches through the same kind of callback.
    println!("\n--- Common Prefix Search for 'application' ---");
    trie.common_prefix_search("application", |key, id| {
        println!(" Found: '{}' (ID: {})", key, id);
    });

    println!("\n--- Predictive Search for 'app' ---");
    trie.predictive_search("app", |key, id| {
        println!(" Found: '{}' (ID: {})", key, id);
    });
}

123
wrapper.cpp Normal file
View file

@ -0,0 +1,123 @@
#include "wrapper.h"

#include <cstdio>
#include <cstdlib>
#include <string>

#include <marisa.h>
extern "C" {
MarisaKeyset* marisa_keyset_new() {
return reinterpret_cast<MarisaKeyset*>(new marisa::Keyset());
}
void marisa_keyset_delete(MarisaKeyset* keyset) {
delete reinterpret_cast<marisa::Keyset*>(keyset);
}
void marisa_keyset_push_back(MarisaKeyset* keyset, const char* key, size_t length, float weight) {
marisa::Keyset* ks = reinterpret_cast<marisa::Keyset*>(keyset);
ks->push_back(std::string(key, length), weight);
}
size_t marisa_keyset_size(const MarisaKeyset* keyset) {
const marisa::Keyset* ks = reinterpret_cast<const marisa::Keyset*>(keyset);
return ks->size();
}
MarisaTrie* marisa_trie_new() {
return reinterpret_cast<MarisaTrie*>(new marisa::Trie());
}
void marisa_trie_delete(MarisaTrie* trie) {
delete reinterpret_cast<marisa::Trie*>(trie);
}
int marisa_trie_build(MarisaTrie* trie, MarisaKeyset* keyset) {
try {
marisa::Trie* tr = reinterpret_cast<marisa::Trie*>(trie);
marisa::Keyset* ks = reinterpret_cast<marisa::Keyset*>(keyset);
tr->build(*ks);
return 1;
} catch (...) {
return 0;
}
}
int marisa_trie_lookup(const MarisaTrie* trie, MarisaAgent* agent) {
try {
const marisa::Trie* tr = reinterpret_cast<const marisa::Trie*>(trie);
marisa::Agent* ag = reinterpret_cast<marisa::Agent*>(agent);
return tr->lookup(*ag) ? 1 : 0;
} catch (...) {
return 0;
}
}
int marisa_trie_reverse_lookup(const MarisaTrie* trie, MarisaAgent* agent) {
try {
const marisa::Trie* tr = reinterpret_cast<const marisa::Trie*>(trie);
marisa::Agent* ag = reinterpret_cast<marisa::Agent*>(agent);
tr->reverse_lookup(*ag);
return 1;
} catch (...) {
return 0;
}
}
int marisa_trie_common_prefix_search(const MarisaTrie* trie, MarisaAgent* agent) {
try {
const marisa::Trie* tr = reinterpret_cast<const marisa::Trie*>(trie);
marisa::Agent* ag = reinterpret_cast<marisa::Agent*>(agent);
return tr->common_prefix_search(*ag) ? 1 : 0;
} catch (...) {
return 0;
}
}
int marisa_trie_predictive_search(const MarisaTrie* trie, MarisaAgent* agent) {
try {
const marisa::Trie* tr = reinterpret_cast<const marisa::Trie*>(trie);
marisa::Agent* ag = reinterpret_cast<marisa::Agent*>(agent);
return tr->predictive_search(*ag) ? 1 : 0;
} catch (...) {
return 0;
}
}
size_t marisa_trie_size(const MarisaTrie* trie) {
const marisa::Trie* tr = reinterpret_cast<const marisa::Trie*>(trie);
return tr->size();
}
MarisaAgent* marisa_agent_new() {
return reinterpret_cast<MarisaAgent*>(new marisa::Agent());
}
void marisa_agent_delete(MarisaAgent* agent) {
delete reinterpret_cast<marisa::Agent*>(agent);
}
void marisa_agent_set_query(MarisaAgent* agent, const char* query, size_t length) {
marisa::Agent* ag = reinterpret_cast<marisa::Agent*>(agent);
ag->set_query(query, length);
}
void marisa_agent_set_query_by_id(MarisaAgent* agent, size_t id) {
marisa::Agent* ag = reinterpret_cast<marisa::Agent*>(agent);
ag->set_query(id);
}
const char* marisa_agent_key_ptr(const MarisaAgent* agent) {
const marisa::Agent* ag = reinterpret_cast<const marisa::Agent*>(agent);
return ag->key().str().data();
}
size_t marisa_agent_key_length(const MarisaAgent* agent) {
const marisa::Agent* ag = reinterpret_cast<const marisa::Agent*>(agent);
return ag->key().str().length();
}
size_t marisa_agent_key_id(const MarisaAgent* agent) {
const marisa::Agent* ag = reinterpret_cast<const marisa::Agent*>(agent);
return ag->key().id();
}
}

41
wrapper.h Normal file
View file

@ -0,0 +1,41 @@
// C interface to the marisa-trie shim. This header is consumed both by the
// C++ implementation and by bindgen, so it must stay valid C.
#pragma once

// System headers are included outside the extern "C" block.
#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Opaque handle types; their layout is private to the implementation.
typedef struct MarisaTrie MarisaTrie;
typedef struct MarisaKeyset MarisaKeyset;
typedef struct MarisaAgent MarisaAgent;

// Keyset functions
// Creates an empty keyset; release it with marisa_keyset_delete.
MarisaKeyset* marisa_keyset_new(void);
// Destroys a keyset created by marisa_keyset_new.
void marisa_keyset_delete(MarisaKeyset* keyset);
// Appends the `length` bytes at `key` with the given weight.
void marisa_keyset_push_back(MarisaKeyset* keyset, const char* key, size_t length, float weight);
// Returns the number of keys in the keyset.
size_t marisa_keyset_size(const MarisaKeyset* keyset);

// Trie functions
// Creates an empty trie; release it with marisa_trie_delete.
MarisaTrie* marisa_trie_new(void);
// Destroys a trie created by marisa_trie_new.
void marisa_trie_delete(MarisaTrie* trie);
// The int-returning functions below yield 1 on success/match and 0 otherwise.
int marisa_trie_build(MarisaTrie* trie, MarisaKeyset* keyset);
int marisa_trie_lookup(const MarisaTrie* trie, MarisaAgent* agent);
int marisa_trie_reverse_lookup(const MarisaTrie* trie, MarisaAgent* agent);
// The two search functions are called repeatedly with the same agent; each
// call that returns 1 leaves the next match in the agent.
int marisa_trie_common_prefix_search(const MarisaTrie* trie, MarisaAgent* agent);
int marisa_trie_predictive_search(const MarisaTrie* trie, MarisaAgent* agent);
// Returns the number of keys in the trie.
size_t marisa_trie_size(const MarisaTrie* trie);

// Agent functions
// Creates an agent with no query; release it with marisa_agent_delete.
MarisaAgent* marisa_agent_new(void);
// Destroys an agent created by marisa_agent_new.
void marisa_agent_delete(MarisaAgent* agent);
// Sets the query to the `length` bytes at `query`.
void marisa_agent_set_query(MarisaAgent* agent, const char* query, size_t length);
// Sets a key ID as the query (for reverse lookup).
void marisa_agent_set_query_by_id(MarisaAgent* agent, size_t id);
// Pointer to and length of the agent's current key. The bytes are not
// guaranteed to be NUL-terminated; always pair the pointer with the length.
const char* marisa_agent_key_ptr(const MarisaAgent* agent);
size_t marisa_agent_key_length(const MarisaAgent* agent);
// ID of the agent's current key.
size_t marisa_agent_key_id(const MarisaAgent* agent);

#ifdef __cplusplus
}
#endif