commit c8b3c3c44f8820dcdc1703ef2685b7ea17b089ae Author: Soma Nakamura Date: Thu Jul 24 02:24:10 2025 +0900 initial diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ed768f3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target/ +Cargo.lock \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..6d05c54 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,457 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "bindgen" +version = "0.69.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", + "which", +] + +[[package]] +name = "bitflags" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" + +[[package]] +name = "cc" +version = "1.2.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deec109607ca693028562ed836a5f1c4b8bd77755c4e132fc5ce11b0b6211ae7" +dependencies = [ + "shlex", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" + +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "errno" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" +dependencies = [ + "libc", + "windows-sys 0.60.2", +] + +[[package]] +name = "glob" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" + +[[package]] +name = "home" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.174" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" + +[[package]] +name = "libloading" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" +dependencies = [ + "cfg-if", + "windows-targets 0.53.2", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "marisa-rs" +version = "0.1.0" +dependencies = [ + "bindgen", + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "memchr" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "prettyplease" +version = "0.2.35" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "061c1221631e079b26479d25bbf2275bfe5917ae8419cd7e34f13bfc2aa7539a" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.59.0", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "syn" +version = "2.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.2", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..f166ced --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "marisa-rs" +version = "0.1.0" +edition = "2021" +description = "Safe Rust wrapper for the marisa-trie 
C++ library - a static memory-efficient trie data structure. Requires marisa-trie system library." +license = "MIT OR Apache-2.0" +readme = "README.md" +homepage = "https://crates.io/crates/marisa-rs" +keywords = ["trie", "string", "search", "marisa"] +categories = ["data-structures", "text-processing"] + +[lib] +name = "marisa_rs" +crate-type = ["cdylib", "rlib"] + +[dependencies] +libc = "0.2" + +[build-dependencies] +cc = "1.0" +pkg-config = "0.3" +bindgen = "0.69" diff --git a/README.md b/README.md new file mode 100644 index 0000000..1f906bb --- /dev/null +++ b/README.md @@ -0,0 +1,178 @@ +# marisa-rs + +Safe Rust wrapper for the [marisa-trie](https://github.com/s-yata/marisa-trie) C++ library. + +marisa-trie is a static and space-efficient trie data structure library. This crate provides safe Rust bindings to the C++ library. + +## Installation + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +marisa-rs = "0.1" +``` + +## Requirements + +This crate requires the marisa-trie C++ library to be installed on your system. 
+ +### Ubuntu/Debian +```bash +sudo apt-get install libmarisa-dev +``` + +### macOS +```bash +brew install marisa-trie +``` + +## Quick Start + +```rust +use marisa_rs::{Keyset, Trie}; + +fn main() { + // Create a keyset and add words + let mut keyset = Keyset::new(); + keyset.push("apple"); + keyset.push("application"); + keyset.push("apply"); + + // Build the trie + let mut trie = Trie::new(); + trie.build(&mut keyset).unwrap(); + + // Lookup a word + if let Some(id) = trie.lookup("apple") { + println!("Found 'apple' with ID: {}", id); + } + + // Search for words starting with "app" + trie.predictive_search("app", |word, id| { + println!("Found: {} (ID: {})", word, id); + }); +} +``` + +## Basic Usage + +### Creating and Building a Trie + +```rust +use marisa_rs::{Keyset, Trie}; + +// Create a keyset +let mut keyset = Keyset::new(); + +// Add words to the keyset +keyset.push("cat"); +keyset.push("car"); +keyset.push("card"); +keyset.push("care"); + +// Build the trie +let mut trie = Trie::new(); +trie.build(&mut keyset)?; +``` + +### Lookup Operations + +```rust +// Exact lookup +match trie.lookup("car") { + Some(id) => println!("Found with ID: {}", id), + None => println!("Not found"), +} + +// Reverse lookup (get word by ID) +match trie.reverse_lookup(0) { + Ok(word) => println!("ID 0 corresponds to: {}", word), + Err(_) => println!("Invalid ID"), +} +``` + +### Search Operations + +```rust +// Find all prefixes of a word +trie.common_prefix_search("cards", |word, id| { + println!("Prefix: {} (ID: {})", word, id); + // Output: "car", "card" +}); + +// Find all words starting with a prefix +trie.predictive_search("car", |word, id| { + println!("Word: {} (ID: {})", word, id); + // Output: "car", "card", "care" +}); +``` + +### Working with Weights + +```rust +let mut keyset = Keyset::new(); + +// Add words with custom weights +keyset.push_back("important", 10.0); +keyset.push_back("normal", 1.0); +keyset.push_back("less_important", 0.1); + +let mut trie = 
Trie::new(); +trie.build(&mut keyset)?; +``` + +## API Reference + +### Keyset + +- `Keyset::new()` - Create a new empty keyset +- `keyset.push(key)` - Add a key with default weight (1.0) +- `keyset.push_back(key, weight)` - Add a key with specified weight +- `keyset.size()` - Get the number of keys +- `keyset.is_empty()` - Check if the keyset is empty + +### Trie + +- `Trie::new()` - Create a new empty trie +- `trie.build(&mut keyset)` - Build the trie from a keyset +- `trie.lookup(key)` - Find the ID of a key (returns `Option<usize>`) +- `trie.reverse_lookup(id)` - Find the key for an ID (returns `Result<String, &'static str>`) +- `trie.common_prefix_search(query, callback)` - Find all keys that are prefixes of query +- `trie.predictive_search(query, callback)` - Find all keys that start with query +- `trie.size()` - Get the number of keys in the trie +- `trie.is_empty()` - Check if the trie is empty + +## Japanese Text Example + +```rust +use marisa_rs::{Keyset, Trie}; + +let mut keyset = Keyset::new(); +keyset.push("あ"); // a +keyset.push("あい"); // ai (love) +keyset.push("あいて"); // aite (partner) + +let mut trie = Trie::new(); +trie.build(&mut keyset).unwrap(); + +// Works with UTF-8 strings +if let Some(id) = trie.lookup("あい") { + println!("Found Japanese word with ID: {}", id); +} +``` + +## Thread Safety + +All types (`Keyset`, `Trie`, `Agent`) implement `Send` and can be transferred between threads. However, they are not `Sync` and cannot be shared between threads without additional synchronization. + +## License + +This project is licensed under either of + + * Apache License, Version 2.0 + * MIT license + +at your option. + +This crate is built on top of the excellent [marisa-trie](https://github.com/s-yata/marisa-trie) library by Susumu Yata.
\ No newline at end of file diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..e5e226a --- /dev/null +++ b/build.rs @@ -0,0 +1,54 @@ +use std::env; +use std::path::PathBuf; + +fn main() { + println!("cargo:rerun-if-changed=wrapper.cpp"); + println!("cargo:rerun-if-changed=wrapper.h"); + + let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + + if pkg_config::Config::new() + .atleast_version("0.2.0") + .probe("marisa") + .is_ok() + { + println!("cargo:rustc-link-lib=static=marisa"); + } else { + println!("cargo:rustc-link-lib=static=marisa"); + println!("cargo:rustc-link-search=native=/usr/local/lib"); + println!("cargo:rustc-link-search=native=/usr/lib"); + } + + let mut build = cc::Build::new(); + build + .cpp(true) + .file("wrapper.cpp") + .flag_if_supported("-std=c++17"); + + if let Ok(cpath) = env::var("CPATH") { + for path in cpath.split(':') { + if !path.is_empty() { + build.include(path); + } + } + } + + build + .include("/usr/local/include") + .include("/usr/include"); + + build.compile("marisa_wrapper"); + + let bindings = bindgen::Builder::default() + .header("wrapper.h") + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) + .rust_target(bindgen::RustTarget::Stable_1_47) + .generate_inline_functions(false) + .generate_comments(false) + .generate() + .expect("Unable to generate bindings"); + + bindings + .write_to_file(out_path.join("bindings.rs")) + .expect("Couldn't write bindings!"); +} \ No newline at end of file diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..fd3bdc5 --- /dev/null +++ b/flake.lock @@ -0,0 +1,96 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": 
"github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1752950548, + "narHash": "sha256-NS6BLD0lxOrnCiEOcvQCDVPXafX1/ek1dfJHX1nUIzc=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "c87b95e25065c028d31a94f06a62927d18763fdf", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs_2": { + "locked": { + "lastModified": 1744536153, + "narHash": "sha256-awS2zRgF4uTwrOKwwiJcByDzDOdo3Q1rPZbiHQg/N38=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "18dd725c29603f582cf1900e0d25f9f1063dbf11", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs", + "rust-overlay": "rust-overlay" + } + }, + "rust-overlay": { + "inputs": { + "nixpkgs": "nixpkgs_2" + }, + "locked": { + "lastModified": 1753238793, + "narHash": "sha256-jmQeEpgX+++MEgrcikcwoSiI7vDZWLP0gci7XiWb9uQ=", + "owner": "oxalica", + "repo": "rust-overlay", + "rev": "0ad7ab4ca8e83febf147197e65c006dff60623ab", + "type": "github" + }, + "original": { + "owner": "oxalica", + "repo": "rust-overlay", + "type": "github" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..1562d9c --- /dev/null +++ b/flake.nix @@ -0,0 +1,85 @@ +{ + description = "Rust wrapper for marisa-trie C++ library"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + rust-overlay.url = "github:oxalica/rust-overlay"; + flake-utils.url = "github:numtide/flake-utils"; + 
}; + + outputs = { self, nixpkgs, rust-overlay, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + overlays = [ (import rust-overlay) ]; + pkgs = import nixpkgs { + inherit system overlays; + }; + + rustToolchain = pkgs.rust-bin.stable.latest.default.override { + extensions = [ "rust-src" "clippy" "rustfmt" ]; + }; + + # Use nixpkgs marisa-trie package + marisa-trie = pkgs.marisa; + + in + { + devShells.default = pkgs.mkShell { + buildInputs = with pkgs; [ + rustToolchain + marisa-trie + + # Build tools + cmake + gcc + pkg-config + clang + llvmPackages.libclang.lib + + # Development tools + rust-analyzer + clippy + rustfmt + + # C++ development + gdb + valgrind + ]; + + shellHook = '' + echo "Rust marisa-trie development environment" + echo "Rust version: $(rustc --version)" + echo "Cargo version: $(cargo --version)" + echo "marisa-trie library available at: ${marisa-trie}" + + export PKG_CONFIG_PATH="${marisa-trie}/lib/pkgconfig:$PKG_CONFIG_PATH" + export LD_LIBRARY_PATH="${marisa-trie}/lib:$LD_LIBRARY_PATH" + export LIBRARY_PATH="${marisa-trie}/lib:$LIBRARY_PATH" + export CPATH="${marisa-trie}/include:$CPATH" + export LIBCLANG_PATH="${pkgs.llvmPackages.libclang.lib}/lib" + ''; + + RUST_SRC_PATH = "${rustToolchain}/lib/rustlib/src/rust/library"; + }; + + packages.default = pkgs.rustPlatform.buildRustPackage { + pname = "marisa-rs"; + version = "0.1.0"; + + src = ./.; + + cargoLock = { + lockFile = ./Cargo.lock; + }; + + buildInputs = [ marisa-trie ]; + nativeBuildInputs = with pkgs; [ cmake gcc pkg-config ]; + + meta = with pkgs.lib; { + description = "Rust wrapper for marisa-trie C++ library"; + license = with licenses; [ mit asl20 ]; + platforms = platforms.unix; + }; + }; + }); +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..4b1a656 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,333 @@ +//! # marisa-rs +//! +//! Safe Rust wrapper for the marisa-trie C++ library. +//! +//! 
marisa-trie is a static and space-efficient trie data structure library. +//! This crate provides safe Rust bindings to the C++ library. +//! +//! ## Example +//! +//! ```rust +//! use marisa_rs::{Keyset, Trie}; +//! +//! let mut keyset = Keyset::new(); +//! keyset.push("apple"); +//! keyset.push("application"); +//! keyset.push("apply"); +//! +//! let mut trie = Trie::new(); +//! trie.build(&mut keyset).unwrap(); +//! +//! // Lookup +//! assert!(trie.lookup("apple").is_some()); +//! assert!(trie.lookup("orange").is_none()); +//! +//! // Common prefix search +//! trie.common_prefix_search("application", |key, id| { +//! println!("Found: {} (ID: {})", key, id); +//! }); +//! ``` + +use std::slice; + +mod bindings { + include!(concat!(env!("OUT_DIR"), "/bindings.rs")); +} + +use bindings::*; + +/// A keyset for building a trie. +/// +/// Keyset is used to store a collection of keys before building a trie. +/// Keys can be added with different weights. +pub struct Keyset { + inner: *mut MarisaKeyset, +} + +impl Keyset { + /// Creates a new empty keyset. + pub fn new() -> Self { + unsafe { + let inner = marisa_keyset_new(); + Keyset { inner } + } + } + + /// Adds a key with the specified weight to the keyset. + pub fn push_back(&mut self, key: &str, weight: f32) { + let key_bytes = key.as_bytes(); + unsafe { + marisa_keyset_push_back( + self.inner, + key_bytes.as_ptr() as *const std::os::raw::c_char, // c_char is i8 or u8 depending on the target + key_bytes.len(), + weight, + ); + } + } + + /// Adds a key with default weight (1.0) to the keyset. + pub fn push(&mut self, key: &str) { + self.push_back(key, 1.0); + } + + /// Returns the number of keys in the keyset. + pub fn size(&self) -> usize { + unsafe { marisa_keyset_size(self.inner) } + } + + /// Returns true if the keyset is empty. + pub fn is_empty(&self) -> bool { + self.size() == 0 + } +} + +impl Drop for Keyset { + fn drop(&mut self) { + unsafe { + marisa_keyset_delete(self.inner); + } + } +} + +/// A trie data structure for efficient string lookups.
+/// +/// The trie must be built from a keyset before it can be used for lookups. +pub struct Trie { + inner: *mut MarisaTrie, +} + +impl Trie { + /// Creates a new empty trie. + pub fn new() -> Self { + unsafe { + let inner = marisa_trie_new(); + Trie { inner } + } + } + + /// Builds the trie from the given keyset. + /// + /// # Errors + /// + /// Returns an error if the trie cannot be built from the keyset. + pub fn build(&mut self, keyset: &mut Keyset) -> Result<(), &'static str> { + unsafe { + if marisa_trie_build(self.inner, keyset.inner) == 1 { + Ok(()) + } else { + Err("Failed to build trie") + } + } + } + + /// Looks up a key in the trie and returns its ID if found. + /// + /// # Returns + /// + /// - `Some(id)` if the key is found in the trie + /// - `None` if the key is not found + pub fn lookup(&self, key: &str) -> Option<usize> { + let mut agent = Agent::new(); + agent.set_query(key); + + unsafe { + if marisa_trie_lookup(self.inner, agent.inner) == 1 { + Some(agent.key_id()) + } else { + None + } + } + } + + /// Performs reverse lookup to get the key corresponding to the given ID. + /// + /// # Errors + /// + /// Returns an error if the ID is not valid. + pub fn reverse_lookup(&self, id: usize) -> Result<String, &'static str> { + let mut agent = Agent::new(); + agent.set_query_by_id(id); + + unsafe { + if marisa_trie_reverse_lookup(self.inner, agent.inner) == 1 { + Ok(agent.key_string()) + } else { + Err("Failed to reverse lookup") + } + } + } + + /// Searches for all keys that are prefixes of the given query. + /// + /// The callback function is called for each matching key with the key and its ID.
+ pub fn common_prefix_search<F>(&self, query: &str, mut callback: F) + where + F: FnMut(&str, usize), + { + let mut agent = Agent::new(); + agent.set_query(query); + + unsafe { + while marisa_trie_common_prefix_search(self.inner, agent.inner) == 1 { + let key = agent.key_string(); + let id = agent.key_id(); + callback(&key, id); + } + } + } + + /// Searches for all keys that have the given query as a prefix. + /// + /// The callback function is called for each matching key with the key and its ID. + pub fn predictive_search<F>(&self, query: &str, mut callback: F) + where + F: FnMut(&str, usize), + { + let mut agent = Agent::new(); + agent.set_query(query); + + unsafe { + while marisa_trie_predictive_search(self.inner, agent.inner) == 1 { + let key = agent.key_string(); + let id = agent.key_id(); + callback(&key, id); + } + } + } + + /// Returns the number of keys stored in the trie. + pub fn size(&self) -> usize { + unsafe { marisa_trie_size(self.inner) } + } + + /// Returns true if the trie is empty. + pub fn is_empty(&self) -> bool { + self.size() == 0 + } +} + +impl Drop for Trie { + fn drop(&mut self) { + unsafe { + marisa_trie_delete(self.inner); + } + } +} + +/// An agent for performing trie operations. +/// +/// Agent is used internally for trie operations and should not be used directly +/// in most cases.
+pub struct Agent { + inner: *mut MarisaAgent, +} + +impl Agent { + pub fn new() -> Self { + unsafe { + let inner = marisa_agent_new(); + Agent { inner } + } + } + + pub fn set_query(&mut self, query: &str) { + let query_bytes = query.as_bytes(); + unsafe { + marisa_agent_set_query( + self.inner, + query_bytes.as_ptr() as *const std::os::raw::c_char, // c_char is i8 or u8 depending on the target + query_bytes.len(), + ); + } + } + + pub fn set_query_by_id(&mut self, id: usize) { + unsafe { + marisa_agent_set_query_by_id(self.inner, id); + } + } + + pub fn key_string(&self) -> String { + unsafe { + let ptr = marisa_agent_key_ptr(self.inner); + let len = marisa_agent_key_length(self.inner); + let slice: &[u8] = if ptr.is_null() || len == 0 { &[] } else { slice::from_raw_parts(ptr as *const u8, len) }; // from_raw_parts requires a non-null pointer even for len 0 + String::from_utf8_lossy(slice).into_owned() + } + } + + pub fn key_id(&self) -> usize { + unsafe { marisa_agent_key_id(self.inner) } + } +} + +impl Drop for Agent { + fn drop(&mut self) { + unsafe { + marisa_agent_delete(self.inner); + } + } +} + +unsafe impl Send for Keyset {} +unsafe impl Send for Trie {} +unsafe impl Send for Agent {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_operations() { + let mut keyset = Keyset::new(); + keyset.push("apple"); + keyset.push("application"); + keyset.push("apply"); + keyset.push("apricot"); + + assert_eq!(keyset.size(), 4); + + let mut trie = Trie::new(); + trie.build(&mut keyset).expect("Failed to build trie"); + + assert_eq!(trie.size(), 4); + + // Test lookup + assert!(trie.lookup("apple").is_some()); + assert!(trie.lookup("banana").is_none()); + + // Test reverse lookup + if let Some(id) = trie.lookup("apple") { + assert_eq!(trie.reverse_lookup(id).unwrap(), "apple"); + } + + // Test common prefix search + let mut results = Vec::new(); + trie.common_prefix_search("application", |key, id| { + results.push((key.to_string(), id)); + }); + assert!(results.len() > 0); + + // Test predictive search + let mut results = Vec::new(); + trie.predictive_search("app", |key, id| { + results.push((key.to_string(), id)); +
}); + assert!(results.len() > 0); + } + + #[test] + fn test_empty_keyset() { + let keyset = Keyset::new(); + assert!(keyset.is_empty()); + assert_eq!(keyset.size(), 0); + } + + #[test] + fn test_empty_trie() { + let trie = Trie::new(); + assert!(trie.is_empty()); + assert_eq!(trie.size(), 0); + } +} \ No newline at end of file diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..4bffc74 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,50 @@ +use marisa_rs::{Keyset, Trie}; + +fn main() { + println!("Marisa Trie Demo"); + println!("================"); + + let mut keyset = Keyset::new(); + let words = vec!["apple", "application", "apply", "apricot"]; + + println!("Adding words to keyset:"); + for word in &words { + keyset.push(word); + println!(" - {}", word); + } + + let mut trie = Trie::new(); + match trie.build(&mut keyset) { + Ok(()) => println!("\nTrie built successfully! Size: {}", trie.size()), + Err(e) => { + println!("Failed to build trie: {}", e); + return; + } + } + + println!("\n--- Lookup Test ---"); + for word in &words { + match trie.lookup(word) { + Some(id) => println!("'{}' found with ID: {}", word, id), + None => println!("'{}' not found", word), + } + } + + println!("\n--- Reverse Lookup Test ---"); + for i in 0..trie.size() { + match trie.reverse_lookup(i) { + Ok(word) => println!("ID {} -> '{}'", i, word), + Err(e) => println!("ID {}: {}", i, e), + } + } + + println!("\n--- Common Prefix Search for 'application' ---"); + trie.common_prefix_search("application", |key, id| { + println!(" Found: '{}' (ID: {})", key, id); + }); + + println!("\n--- Predictive Search for 'app' ---"); + trie.predictive_search("app", |key, id| { + println!(" Found: '{}' (ID: {})", key, id); + }); +} diff --git a/wrapper.cpp b/wrapper.cpp new file mode 100644 index 0000000..7fea9d2 --- /dev/null +++ b/wrapper.cpp @@ -0,0 +1,123 @@ +#include "wrapper.h" +#include <marisa.h> +#include <string> + +extern "C" { + +MarisaKeyset* marisa_keyset_new() { + return reinterpret_cast<MarisaKeyset*>(new
marisa::Keyset()); +} + +void marisa_keyset_delete(MarisaKeyset* keyset) { + delete reinterpret_cast(keyset); +} + +void marisa_keyset_push_back(MarisaKeyset* keyset, const char* key, size_t length, float weight) { + marisa::Keyset* ks = reinterpret_cast(keyset); + ks->push_back(std::string(key, length), weight); +} + +size_t marisa_keyset_size(const MarisaKeyset* keyset) { + const marisa::Keyset* ks = reinterpret_cast(keyset); + return ks->size(); +} + +MarisaTrie* marisa_trie_new() { + return reinterpret_cast(new marisa::Trie()); +} + +void marisa_trie_delete(MarisaTrie* trie) { + delete reinterpret_cast(trie); +} + +int marisa_trie_build(MarisaTrie* trie, MarisaKeyset* keyset) { + try { + marisa::Trie* tr = reinterpret_cast(trie); + marisa::Keyset* ks = reinterpret_cast(keyset); + tr->build(*ks); + return 1; + } catch (...) { + return 0; + } +} + +int marisa_trie_lookup(const MarisaTrie* trie, MarisaAgent* agent) { + try { + const marisa::Trie* tr = reinterpret_cast(trie); + marisa::Agent* ag = reinterpret_cast(agent); + return tr->lookup(*ag) ? 1 : 0; + } catch (...) { + return 0; + } +} + +int marisa_trie_reverse_lookup(const MarisaTrie* trie, MarisaAgent* agent) { + try { + const marisa::Trie* tr = reinterpret_cast(trie); + marisa::Agent* ag = reinterpret_cast(agent); + tr->reverse_lookup(*ag); + return 1; + } catch (...) { + return 0; + } +} + +int marisa_trie_common_prefix_search(const MarisaTrie* trie, MarisaAgent* agent) { + try { + const marisa::Trie* tr = reinterpret_cast(trie); + marisa::Agent* ag = reinterpret_cast(agent); + return tr->common_prefix_search(*ag) ? 1 : 0; + } catch (...) { + return 0; + } +} + +int marisa_trie_predictive_search(const MarisaTrie* trie, MarisaAgent* agent) { + try { + const marisa::Trie* tr = reinterpret_cast(trie); + marisa::Agent* ag = reinterpret_cast(agent); + return tr->predictive_search(*ag) ? 1 : 0; + } catch (...) 
{ + return 0; + } +} + +size_t marisa_trie_size(const MarisaTrie* trie) { + const marisa::Trie* tr = reinterpret_cast(trie); + return tr->size(); +} + +MarisaAgent* marisa_agent_new() { + return reinterpret_cast(new marisa::Agent()); +} + +void marisa_agent_delete(MarisaAgent* agent) { + delete reinterpret_cast(agent); +} + +void marisa_agent_set_query(MarisaAgent* agent, const char* query, size_t length) { + marisa::Agent* ag = reinterpret_cast(agent); + ag->set_query(query, length); +} + +void marisa_agent_set_query_by_id(MarisaAgent* agent, size_t id) { + marisa::Agent* ag = reinterpret_cast(agent); + ag->set_query(id); +} + +const char* marisa_agent_key_ptr(const MarisaAgent* agent) { + const marisa::Agent* ag = reinterpret_cast(agent); + return ag->key().str().data(); +} + +size_t marisa_agent_key_length(const MarisaAgent* agent) { + const marisa::Agent* ag = reinterpret_cast(agent); + return ag->key().str().length(); +} + +size_t marisa_agent_key_id(const MarisaAgent* agent) { + const marisa::Agent* ag = reinterpret_cast(agent); + return ag->key().id(); +} + +} \ No newline at end of file diff --git a/wrapper.h b/wrapper.h new file mode 100644 index 0000000..9289af5 --- /dev/null +++ b/wrapper.h @@ -0,0 +1,41 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +typedef struct MarisaTrie MarisaTrie; +typedef struct MarisaKeyset MarisaKeyset; +typedef struct MarisaAgent MarisaAgent; + +// Keyset functions +MarisaKeyset* marisa_keyset_new(); +void marisa_keyset_delete(MarisaKeyset* keyset); +void marisa_keyset_push_back(MarisaKeyset* keyset, const char* key, size_t length, float weight); +size_t marisa_keyset_size(const MarisaKeyset* keyset); + +// Trie functions +MarisaTrie* marisa_trie_new(); +void marisa_trie_delete(MarisaTrie* trie); +int marisa_trie_build(MarisaTrie* trie, MarisaKeyset* keyset); +int marisa_trie_lookup(const MarisaTrie* trie, MarisaAgent* agent); +int marisa_trie_reverse_lookup(const MarisaTrie* trie, 
MarisaAgent* agent); +int marisa_trie_common_prefix_search(const MarisaTrie* trie, MarisaAgent* agent); +int marisa_trie_predictive_search(const MarisaTrie* trie, MarisaAgent* agent); +size_t marisa_trie_size(const MarisaTrie* trie); + +// Agent functions +MarisaAgent* marisa_agent_new(); +void marisa_agent_delete(MarisaAgent* agent); +void marisa_agent_set_query(MarisaAgent* agent, const char* query, size_t length); +void marisa_agent_set_query_by_id(MarisaAgent* agent, size_t id); +const char* marisa_agent_key_ptr(const MarisaAgent* agent); +size_t marisa_agent_key_length(const MarisaAgent* agent); +size_t marisa_agent_key_id(const MarisaAgent* agent); + +#ifdef __cplusplus +} +#endif \ No newline at end of file