From dadab5514e44bf1a03eb25c16d7e361e013233e0 Mon Sep 17 00:00:00 2001
From: Leo Galambos
Date: Sat, 16 May 2026 03:24:07 +0200
Subject: [PATCH] feat: implement dense-child optimized trie lookup and
enterprise test/CI profile hardening
---
.github/workflows/build.yml | 12 +-
.github/workflows/pages.yml | 19 +-
README.md | 3 +
build.gradle | 161 +++++-
docs/lookup-edge-optimization.md | 193 +++++++
docs/programmatic-loading-and-building.md | 37 ++
docs/programmatic-usage.md | 1 +
docs/quality-and-operations.md | 21 +
docs/quick-start.md | 30 +
docs/reports.md | 2 +-
docs/test-taxonomy-and-filtering.md | 216 ++++++++
gradle/maven-pom.gradle | 2 +-
mkdocs.yml | 2 +
.../org/egothor/stemmer/FrequencyTrie.java | 524 +++++++++++-------
.../stemmer/StemmerPatchTrieBinaryIO.java | 71 +++
.../stemmer/StemmerPatchTrieLoader.java | 55 +-
.../egothor/stemmer/trie/CompiledNode.java | 260 ++++++++-
.../stemmer/CompileIntegrationTest.java | 12 +-
.../java/org/egothor/stemmer/CompileTest.java | 5 +-
.../CompiledTrieArtifactRegressionTest.java | 3 +-
.../stemmer/DiacriticStripperTest.java | 3 +-
.../stemmer/FrequencyTrieBuildersTest.java | 2 +-
.../stemmer/FrequencyTrieProperties.java | 4 +-
.../egothor/stemmer/FrequencyTrieTest.java | 306 ++++++++++
.../FuzzStemmerAndTrieCompilationTest.java | 3 +-
.../PatchCommandEncoderProperties.java | 4 +-
.../stemmer/PatchCommandEncoderTest.java | 6 +-
.../stemmer/StemmerDictionaryParserTest.java | 1 +
.../StemmerKnowledgeExperimentTest.java | 2 +-
.../stemmer/StemmerPatchTrieBinaryIOTest.java | 153 ++++-
.../stemmer/StemmerPatchTrieLoaderTest.java | 71 ++-
.../stemmer/StemmerPatchTrieProperties.java | 5 +-
.../org/egothor/stemmer/TrieMetadataTest.java | 2 +
.../stemmer/WordTraversalDirectionTest.java | 2 +
.../stemmer/trie/ChildDescriptorTest.java | 2 +-
.../trie/CompiledNodeAndNodeDataTest.java | 135 ++++-
.../trie/DominantLocalDescriptorTest.java | 2 +-
.../stemmer/trie/LocalValueSummaryTest.java | 2 +-
.../egothor/stemmer/trie/MutableNodeTest.java | 2 +-
.../trie/RankedLocalDescriptorTest.java | 2 +-
.../egothor/stemmer/trie/ReducedNodeTest.java | 2 +-
.../stemmer/trie/ReductionContextTest.java | 2 +-
.../stemmer/trie/ReductionSignatureTest.java | 2 +-
.../trie/UnorderedLocalDescriptorTest.java | 2 +-
44 files changed, 2052 insertions(+), 294 deletions(-)
create mode 100644 docs/lookup-edge-optimization.md
create mode 100644 docs/test-taxonomy-and-filtering.md
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9d6de74..eb5a5b6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -51,7 +51,7 @@ jobs:
test -f gradle/verification-metadata.xml
- name: Execute build, tests, PMD, coverage, Javadoc, distribution packaging, and SBOM generation
- run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport distZip cyclonedxBom
+ run: ./gradlew --no-daemon clean ciRelease distZip pmdMain javadoc jacocoCiReleaseReport cyclonedxBom
- name: Upload SBOM
if: always()
@@ -70,8 +70,8 @@ jobs:
with:
name: test-reports
path: |
- build/reports/tests/test
- build/test-results/test
+ build/reports/tests
+ build/test-results
if-no-files-found: warn
retention-days: 14
@@ -90,8 +90,8 @@ jobs:
with:
name: coverage-reports
path: |
- build/reports/jacoco/test/html
- build/reports/jacoco/test/jacocoTestReport.xml
+ build/reports/jacoco/jacocoCiReleaseReport/html
+ build/reports/jacoco/jacocoCiReleaseReport/jacocoCiReleaseReport.xml
if-no-files-found: warn
retention-days: 14
@@ -160,7 +160,7 @@ jobs:
env:
SIGNING_KEY: ${{ secrets.SIGNING_KEY }}
SIGNING_PASSWORD: ${{ secrets.SIGNING_PASSWORD }}
- run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport cyclonedxBom centralBundle
+ run: ./gradlew --no-daemon clean ciRelease distZip pmdMain javadoc jacocoCiReleaseReport cyclonedxBom centralBundle
- name: Generate release changelog
shell: bash
diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml
index 513a1c6..f1447d5 100644
--- a/.github/workflows/pages.yml
+++ b/.github/workflows/pages.yml
@@ -70,7 +70,7 @@ jobs:
test -f gradle/verification-metadata.xml
- name: Build reports for publication
- run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport pitest jmh cyclonedxBom
+ run: ./gradlew --no-daemon clean ciRelease pmdMain javadoc jacocoCiReleaseReport pitest jmh cyclonedxBom
- name: Prepare gh-pages worktree
shell: bash
@@ -93,6 +93,9 @@ jobs:
run: |
set -euo pipefail
+ TEST_REPORT_DIR="build/reports/tests/ciRelease"
+ JACOCO_REPORT_DIR="build/reports/jacoco/jacocoCiReleaseReport"
+
SITE_DIR=".gh-pages"
RUN_DIR="${SITE_DIR}/builds/${GITHUB_RUN_NUMBER}"
RUN_METRICS_DIR="${RUN_DIR}/metrics"
@@ -106,14 +109,14 @@ jobs:
cp -R build/docs/javadoc "${RUN_DIR}/javadoc"
cp -R build/docs/javadoc "${LATEST_DIR}/javadoc"
- cp -R build/reports/tests/test "${RUN_DIR}/test"
- cp -R build/reports/tests/test "${LATEST_DIR}/test"
+ cp -R "${TEST_REPORT_DIR}" "${RUN_DIR}/test"
+ cp -R "${TEST_REPORT_DIR}" "${LATEST_DIR}/test"
cp -R build/reports/pmd "${RUN_DIR}/pmd"
cp -R build/reports/pmd "${LATEST_DIR}/pmd"
- cp -R build/reports/jacoco/test/html "${RUN_DIR}/coverage"
- cp -R build/reports/jacoco/test/html "${LATEST_DIR}/coverage"
+ cp -R "${JACOCO_REPORT_DIR}/html" "${RUN_DIR}/coverage"
+ cp -R "${JACOCO_REPORT_DIR}/html" "${LATEST_DIR}/coverage"
cp -R build/reports/pitest "${RUN_DIR}/pitest"
cp -R build/reports/pitest "${LATEST_DIR}/pitest"
@@ -178,7 +181,7 @@ jobs:
python3 \
./tools/generate-pages-badges.py \
- --jacoco-xml build/reports/jacoco/test/jacocoTestReport.xml \
+ --jacoco-xml "${JACOCO_REPORT_DIR}/jacocoCiReleaseReport.xml" \
--pit-xml build/reports/pitest/mutations.xml \
--jmh-csv build/reports/jmh/jmh-results.csv \
--run-metrics-dir "${RUN_METRICS_DIR}" \
@@ -228,7 +231,7 @@ jobs:
Build ${GITHUB_RUN_NUMBER} from commit ${GITHUB_SHA}
- Javadoc
- - Test Report
+ - Release Verification Test Report (ciRelease)
- PMD Report
- Coverage Report
${DEPENDENCY_CHECK_LINK:-- Dependency Vulnerability Report: not available
}
@@ -260,7 +263,7 @@ jobs:
- [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/)
- [Javadoc](https://leogalambos.github.io/Radixor/builds/latest/javadoc/)
- - [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
+ - [Release verification test report (ciRelease)](https://leogalambos.github.io/Radixor/builds/latest/test/)
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
diff --git a/README.md b/README.md
index 1291d0f..c7cda46 100644
--- a/README.md
+++ b/README.md
@@ -167,6 +167,9 @@ The repository keeps the front page concise and places detailed documentation un
- [Architecture](docs/architecture.md)
Structural model, data flow, and runtime lookup behavior.
+- [Lookup Edge Optimization](docs/lookup-edge-optimization.md)
+ Speed/memory trade-off of dense child edge lookup in compiled tries.
+
- [Reduction Semantics](docs/reduction-semantics.md)
Ranked, unordered, and dominant reduction behavior.
diff --git a/build.gradle b/build.gradle
index 3ac336d..bbf08ea 100644
--- a/build.gradle
+++ b/build.gradle
@@ -108,9 +108,19 @@ dependencyCheck {
}
}
-tasks.withType(Test).configureEach {
- useJUnitPlatform()
+// Optional JUnit tag filters supplied on the command line. Each value is read from the Gradle
+// project property of that name and, when that is absent, from the system property of the same name.
+def cliIncludeTags = project.findProperty('includeTags')?.toString() ?: System.getProperty('includeTags')
+def cliExcludeTags = project.findProperty('excludeTags')?.toString() ?: System.getProperty('excludeTags')
+// Splits a comma-separated tag expression into trimmed, non-blank entries.
+// A null or blank expression yields an empty list.
+def splitTagExpression = { String tagsExpr ->
+ if (tagsExpr == null || tagsExpr.isBlank()) {
+ return []
+ }
+ return tagsExpr.split(',')
+ .collect { it.trim() }
+ .findAll { it != null && !it.isBlank() }
+}
+
+tasks.withType(Test).configureEach {
doFirst {
jvmArgs "-javaagent:${configurations.mockitoAgent.singleFile}"
}
@@ -123,14 +133,127 @@ tasks.withType(Test).configureEach {
minHeapSize = '1g'
maxHeapSize = '4g'
- finalizedBy(tasks.named('jacocoTestReport'))
-
reports {
junitXml.required = true
html.required = true
}
}
+// Enables the JUnit Platform on the given Test task and applies the optional tag filters.
+// Both expressions are comma-separated tag lists; a null or blank expression adds no filter.
+def configureJUnitPlatformTags = { Test task, String includeTagsExpr, String excludeTagsExpr ->
+ task.useJUnitPlatform {
+ final def includes = splitTagExpression(includeTagsExpr)
+ final def excludes = splitTagExpression(excludeTagsExpr)
+
+ // Only call includeTags/excludeTags when there is at least one tag to pass.
+ if (!includes.isEmpty()) {
+ includeTags(*includes.toArray(new String[0]))
+ }
+ if (!excludes.isEmpty()) {
+ excludeTags(*excludes.toArray(new String[0]))
+ }
+ }
+}
+
+// The default `test` task applies the command-line tag filters (cliIncludeTags / cliExcludeTags)
+// and is always followed by the standard jacocoTestReport task.
+tasks.named('test', Test) {
+ configureJUnitPlatformTags(it, cliIncludeTags, cliExcludeTags)
+ finalizedBy(tasks.named('jacocoTestReport'))
+}
+
+// Registers a Test task named `taskName` that runs the test source set filtered by JUnit tags.
+//   includeTagsExpr / excludeTagsExpr: comma-separated tag lists; null or blank adds no filter.
+//   taskDescription: text assigned to the task description.
+//   testNameExcludePatterns: optional comma-separated patterns passed to excludeTestsMatching.
+// The Mockito agent hook, heap sizing, and report settings are intentionally not repeated here:
+// tasks.withType(Test).configureEach already applies them to every Test task, and repeating the
+// doFirst hook would pass the same -javaagent argument to the test JVM twice.
+def configureTaggedTestProfile = { String taskName, String includeTagsExpr, String excludeTagsExpr = null,
+        String taskDescription = null, String testNameExcludePatterns = null ->
+    tasks.register(taskName, Test) {
+        group = 'verification'
+        description = taskDescription
+
+        configureJUnitPlatformTags(delegate as Test, includeTagsExpr, excludeTagsExpr)
+        // Custom Test tasks must be pointed at the test source set explicitly.
+        testClassesDirs = sourceSets.test.output.classesDirs
+        classpath = sourceSets.test.runtimeClasspath
+        dependsOn(tasks.named('compileTestJava'))
+
+        // Name-based exclusions act in addition to the tag filters above.
+        if (testNameExcludePatterns != null && !testNameExcludePatterns.isBlank()) {
+            filter {
+                testNameExcludePatterns.split(',').each { String pattern ->
+                    final def trimmedPattern = pattern.trim()
+                    if (!trimmedPattern.isEmpty()) {
+                        excludeTestsMatching(trimmedPattern)
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Profile registrations. Argument order for every call:
+// task name, include tags, exclude tags, task description, name-based exclude patterns (optional).
+// A null include-tag argument adds no include filter; a null exclude-tag argument adds no exclude filter.
+configureTaggedTestProfile(
+ 'ciSmoke',
+ 'unit',
+ 'slow',
+ 'Fast feedback profile for unit tests with slow tests explicitly excluded.',
+ 'org.egothor.stemmer.CompileIntegrationTest*'
+)
+
+configureTaggedTestProfile(
+ 'ciCore',
+ 'unit,trie,frequency-trie,property',
+ null,
+ 'Focused profile for core trie behavior and trie-specific property checks.'
+)
+
+configureTaggedTestProfile(
+ 'ciIntegration',
+ 'integration',
+ 'slow',
+ 'Integration pipeline profile (loader/parser/CLI/IO end-to-end flows) excluding slow integration paths.'
+)
+
+configureTaggedTestProfile(
+ 'ciSlow',
+ 'slow',
+ null,
+ 'Targeted profile for all slow tests (large dictionaries, long-running corpus validation, and heavy integration checks).'
+)
+
+configureTaggedTestProfile(
+ 'ciCompat',
+ 'compat,regression',
+ null,
+ 'Compatibility profile guarding persisted artifact and compatibility regressions.'
+)
+
+// ciRelease has no include filter: it runs every test not tagged `slow`, minus the name patterns below.
+configureTaggedTestProfile(
+ 'ciRelease',
+ null,
+ 'slow',
+ 'Release-profile validation of all non-slow tests.',
+ 'org.egothor.stemmer.CompileIntegrationTest*,org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*'
+)
+
+configureTaggedTestProfile(
+ 'ciNightly',
+ 'fuzz',
+ null,
+ 'Nightly robustness profile with fuzz testing emphasis.'
+)
+
+
+// Umbrella task aggregating the smoke, core, integration, and compatibility profiles.
+// dependsOn only declares prerequisites; it does not impose an execution order between them,
+// and the slow, release, and nightly profiles are deliberately not part of this aggregate.
+tasks.register('ci') {
+    group = 'verification'
+    description = 'Runs the smoke, core, integration, and compatibility CI profiles.'
+    dependsOn(tasks.named('ciSmoke'))
+    dependsOn(tasks.named('ciCore'))
+    dependsOn(tasks.named('ciIntegration'))
+    dependsOn(tasks.named('ciCompat'))
+}
+
+
tasks.withType(Pmd).configureEach {
reports {
xml.required = true
@@ -155,6 +278,36 @@ tasks.named('jacocoTestReport', JacocoReport) {
}
}
+// Registers a JacocoReport task named `reportTaskName` that reports coverage recorded by the
+// Test task `sourceTaskName` (execution data file build/jacoco/<sourceTaskName>.exec).
+def registerJacocoProfileReport = { String reportTaskName, String sourceTaskName ->
+    tasks.register(reportTaskName, JacocoReport) {
+        group = 'verification'
+        description = "Generates Jacoco report for ${sourceTaskName} execution."
+
+        dependsOn(tasks.named(sourceTaskName))
+
+        // Custom JacocoReport tasks receive no default source directories; without them the
+        // HTML report is generated without per-line source views.
+        sourceDirectories.setFrom(files(sourceSets.main.allSource.srcDirs))
+
+        // Coverage is computed over main classes, minus the two excluded class families.
+        classDirectories.setFrom(
+            files(sourceSets.main.output).asFileTree.matching {
+                exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
+                exclude 'org/egothor/stemmer/DiacriticStripper*'
+            }
+        )
+
+        executionData.setFrom(
+            fileTree(layout.buildDirectory.dir('jacoco')) {
+                include "${sourceTaskName}.exec"
+            }
+        )
+
+        reports {
+            xml.required = true
+            csv.required = false
+            html.required = true
+        }
+    }
+}
+
+registerJacocoProfileReport('jacocoCiReleaseReport', 'ciRelease')
+
tasks.named('check') {
dependsOn(tasks.named('jacocoTestReport'))
// no-default, only on-demand: dependsOn(tasks.named('dependencyCheckAnalyze'))
diff --git a/docs/lookup-edge-optimization.md b/docs/lookup-edge-optimization.md
new file mode 100644
index 0000000..f98e8c3
--- /dev/null
+++ b/docs/lookup-edge-optimization.md
@@ -0,0 +1,193 @@
+# Lookup Edge Optimization
+
+Compiled trie nodes (`CompiledNode`) use three lookup strategies when resolving child edges:
+
+1. dense array direct lookup,
+2. linear scan for very small child counts,
+3. binary search over sorted edge labels.
+
+This page explains the dense path, what `maxExpandedIndex` controls, and how to tune it.
+
+## Runtime model of one node
+
+For a node with sorted edge labels `char[] edges`, the implementation can materialize an
+index-aligned dense table when labels occupy a small compact code-point interval:
+
+```text
+span = maxEdge - minEdge
+use dense table iff (span <= maxExpandedIndex) and (maxExpandedIndex > 0)
+```
+
+When dense lookup is used, lookup is constant-time indexing:
+
+```text
+denseIndex = requestedEdge - minEdge
+return denseChildren[denseIndex] // or null if outside interval
+```
+
+When dense lookup is not active (interval is too wide or the configured
+`maxExpandedIndex` is `0`), `CompiledNode` still chooses between two fallback
+strategies:
+
+- **linear scan** for very small child counts (`4` or fewer children),
+- **binary search** for larger child counts.
+
+This means the fallback method is selected by child count, not by “distance” alone.
+`linear scan` is therefore used when there are only a few edges even if those edges are
+spread across very distant code points.
+
+### Example: few edges, wide Unicode span
+
+```text
+edges = ['a', '中', '你']
+edge count = 3
+minEdge = 'a' (U+0061)
+maxEdge = '你' (U+4F60)
+span = 20223
+```
+
+- If `maxExpandedIndex = 512`, dense indexing is not used because `span > maxExpandedIndex`.
+- Because `edge count = 3` (<= 4), lookup falls back to a tiny linear scan of the
+ three labels.
+- This is exactly the case the child-count fallback covers: lookup stays cheap even though the interval is far too wide for a dense table.
+
+Dense lookup is useful for non-Latin scripts as well: what matters is interval width in Unicode
+code points, not script name. A compact Arabic-range block can still benefit from dense
+lookups when keys stay in a tight code-point interval.
+
+## Why this is configurable
+
+`maxExpandedIndex` is purely a speed/memory trade-off:
+
+- higher value:
+ - more compact intervals qualify for dense tables,
+ - more constant-time child lookup,
+ - more memory for dense tables in qualifying nodes.
+- lower value (or `0`):
+ - less dense-table allocation,
+ - fewer nodes take the constant-time path,
+ - lower materialization memory.
+
+The value never changes lookup semantics. It only changes the in-memory structure shape.
+
+## Persistence and loading model
+
+This threshold is **not** stored in `TrieMetadata`.
+
+- The binary format stores only trie payload and semantic metadata (`reduction`, `traversal`,
+ case/diacritic settings, and stream version).
+- `maxExpandedIndex` is chosen when materializing nodes in memory.
+- You can therefore keep one persisted artifact and load it with different in-memory
+ trade-offs depending on deployment constraints.
+
+## Default
+
+- `FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX == 512`
+- `CompiledNode.DEFAULT_MAX_EXPANDED_INDEX == 512`
+
+These are practical defaults for mixed-language text and Latin-like scripts where edge labels
+often cluster.
+
+## Tune during build (writable phase)
+
+Use the full `FrequencyTrie.Builder` constructor when you are compiling from source data.
+The builder threshold is applied while freezing reduced nodes into the immutable form.
+
+```java
+import org.egothor.stemmer.CaseProcessingMode;
+import org.egothor.stemmer.DiacriticProcessingMode;
+import org.egothor.stemmer.FrequencyTrie;
+import org.egothor.stemmer.ReductionMode;
+import org.egothor.stemmer.ReductionSettings;
+import org.egothor.stemmer.WordTraversalDirection;
+
+final ReductionSettings settings = ReductionSettings.withDefaults(
+ ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
+
+final FrequencyTrie.Builder fastBuilder =
+ new FrequencyTrie.Builder<>(String[]::new,
+ settings,
+ WordTraversalDirection.BACKWARD,
+ CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
+ DiacriticProcessingMode.AS_IS,
+ 1024); // prefer lookup speed
+
+// ... put(...) ...
+final FrequencyTrie trie = fastBuilder.build();
+```
+
+Use a smaller value such as `256`, or `0` to disable dense tables entirely, to lower memory use when building large tries.
+
+```java
+final FrequencyTrie.Builder compactBuilder =
+ new FrequencyTrie.Builder<>(String[]::new,
+ settings,
+ WordTraversalDirection.BACKWARD,
+ CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
+ DiacriticProcessingMode.AS_IS,
+ 256); // lower memory profile
+```
+
+## Tune when loading a binary artifact (runtime phase)
+
+At artifact load time, you can tune the same trade-off independently of persisted metadata.
+
+```java
+import java.nio.file.Path;
+
+import org.egothor.stemmer.StemmerPatchTrieLoader;
+
+var defaultLookup = StemmerPatchTrieLoader.loadBinary(
+ Path.of("stemmers", "english.radixor.gz"));
+
+var fastLookup = StemmerPatchTrieLoader.loadBinary(
+ Path.of("stemmers", "english.radixor.gz"), 1024);
+
+var compactLookup = StemmerPatchTrieLoader.loadBinary(
+ Path.of("stemmers", "english.radixor.gz"), 0);
+```
+
+You can also set the threshold directly with `FrequencyTrie.readFrom(...)` when reading streams:
+
+```java
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.zip.GZIPInputStream;
+
+import org.egothor.stemmer.FrequencyTrie;
+
+public final class StreamLoadExample {
+
+ private StreamLoadExample() {
+ throw new AssertionError("No instances.");
+ }
+
+ public static void main(final String[] arguments) throws IOException {
+ try (InputStream fileInput = Files.newInputStream(Path.of("stemmers", "english.radixor.gz"));
+ GZIPInputStream gzip = new GZIPInputStream(fileInput);
+ DataInputStream dataInput = new DataInputStream(gzip)) {
+ final FrequencyTrie compactOnLoad = FrequencyTrie.readFrom(
+ dataInput,
+ String[]::new,
+ input -> input.readUTF(),
+ 256);
+ }
+ }
+}
+```
+
+Note: the string codec is intentionally inline in this snippet to keep it self-contained.
+
+## Practical guidance
+
+- Start with default (`512`) in production and profile before changing it.
+- Use `0` when memory is the priority and query throughput is not the bottleneck.
+- Use values around `1024` for workloads dominated by compact alphabets and very hot lookups.
+
+Trade-off expectation:
+
+- increasing `maxExpandedIndex` improves lookup speed when edges tend to occupy short spans,
+- decreasing it reduces per-node auxiliary memory in dense-span nodes.
diff --git a/docs/programmatic-loading-and-building.md b/docs/programmatic-loading-and-building.md
index 45c93d3..6269cda 100644
--- a/docs/programmatic-loading-and-building.md
+++ b/docs/programmatic-loading-and-building.md
@@ -87,6 +87,43 @@ public final class LoadBinaryExample {
The binary format is the native `FrequencyTrie` serialization wrapped in GZip compression. It includes persisted `TrieMetadata`, so lookup after loading uses the traversal, case-processing, diacritic-processing, and reduction settings captured when the trie was compiled.
+## Tune child lookup density when loading binaries
+
+To optimize hot-path latency, you can tune direct child indexing by passing `maxExpandedIndex`
+at load time. This does not change persisted metadata, only the materialized in-memory form.
+
+```java
+import java.io.IOException;
+import java.nio.file.Path;
+
+import org.egothor.stemmer.FrequencyTrie;
+import org.egothor.stemmer.StemmerPatchTrieLoader;
+
+public final class LoadBinaryWithDenseLookupExample {
+
+ private LoadBinaryWithDenseLookupExample() {
+ throw new AssertionError("No instances.");
+ }
+
+ public static void main(final String[] arguments) throws IOException {
+ final FrequencyTrie balanced = StemmerPatchTrieLoader.loadBinary(
+ Path.of("stemmers", "english.radixor.gz"));
+
+ final FrequencyTrie fast = StemmerPatchTrieLoader.loadBinary(
+ Path.of("stemmers", "english.radixor.gz"),
+ 1024);
+
+ final FrequencyTrie compact = StemmerPatchTrieLoader.loadBinary(
+ Path.of("stemmers", "english.radixor.gz"),
+ 0);
+ }
+}
+```
+
+Negative values fall back to `FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX`.
+
+[Lookup Edge Optimization](lookup-edge-optimization.md) describes the trade-off in detail and also provides examples for build-time tuning.
+
## Build directly with a mutable builder
A `FrequencyTrie.Builder` accepts repeated `put(key, value)` calls and compiles the final read-only trie through `build()`. Compilation performs bottom-up reduction and produces the compact immutable runtime representation.
diff --git a/docs/programmatic-usage.md b/docs/programmatic-usage.md
index da2dfc6..2edbd9b 100644
--- a/docs/programmatic-usage.md
+++ b/docs/programmatic-usage.md
@@ -25,6 +25,7 @@ This is why Radixor can generalize beyond explicitly listed forms and why compil
The programmatic API is easier to understand when split by developer task:
- [Loading and Building Stemmers](programmatic-loading-and-building.md) explains how to acquire a compiled stemmer from bundled resources, textual dictionaries, binary artifacts, or direct builder usage.
+- [Lookup Edge Optimization](lookup-edge-optimization.md) explains dense child lookup tuning and the speed/memory trade-off when materializing compiled tries.
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md) explains `get(...)`, `getAll(...)`, `getEntries(...)`, patch application, and the practical meaning of reduction modes.
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md) explains how to reopen compiled tries, add new lexical data, rebuild them, and store them as binary artifacts.
diff --git a/docs/quality-and-operations.md b/docs/quality-and-operations.md
index 8baca34..ce1d490 100644
--- a/docs/quality-and-operations.md
+++ b/docs/quality-and-operations.md
@@ -58,6 +58,27 @@ A deterministic system is easier to test, easier to reason about, and safer to i
The project is intended to maintain very high confidence in both core correctness and behavioral stability.
+The recommended execution strategy is defined by the tagged test profiles in [Test taxonomy and execution filtering](test-taxonomy-and-filtering.md). In practice, teams can execute profile tasks directly:
+
+- `./gradlew ciSmoke`: fast local/PR safety checks (`unit`, excluding `slow`; additionally excludes
+ `CompileIntegrationTest` as a defensive safeguard).
+- `./gradlew ciSlow`: enterprise heavy gate for all tests marked with `slow` (typically
+ production dictionary and large corpus verification). This should be used for scheduled/manual
+ hardening gates and not in the standard release build.
+- `./gradlew ciCore`: behavioral coverage of trie and frequency-trie paths (tags `unit`, `trie`, `frequency-trie`, and `property`)
+- `./gradlew ciIntegration`: pipeline and CLI integration path checks
+- `./gradlew ciCompat`: compatibility and regression verification for persisted artifacts
+- `./gradlew ciRelease`: full non-slow suite for release-confidence runs (all test tags except `slow`,
+ plus explicit name-based exclusion of `CompileIntegrationTest*` and
+ `StemmerPatchTrieLoaderTest$BundledDictionaryTests*` as additional guardrails)
+- `./gradlew ciNightly`: extended fuzz profile for robustness hardening
+- `./gradlew ci`: umbrella profile depending on smoke/core/integration/compat
+
+### Test taxonomy and execution filtering
+
+The full tag taxonomy and executable filter examples are documented in
+[Test taxonomy and execution filtering](test-taxonomy-and-filtering.md).
+
### Structural coverage
High code coverage is treated as a useful signal, but not as a sufficient goal on its own. Coverage is valuable only when the covered scenarios actually pressure the implementation in meaningful ways.
diff --git a/docs/quick-start.md b/docs/quick-start.md
index 06fb887..71010d2 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -67,6 +67,36 @@ public final class LoadBinaryStemmerExample {
}
```
+You can tune in-memory child lookup density at load time without changing the artifact:
+
+```java
+import java.io.IOException;
+import java.nio.file.Path;
+
+import org.egothor.stemmer.FrequencyTrie;
+import org.egothor.stemmer.StemmerPatchTrieLoader;
+
+public final class LoadBinaryStemmerExampleTuned {
+
+ private LoadBinaryStemmerExampleTuned() {
+ throw new AssertionError("No instances.");
+ }
+
+ public static void main(final String[] arguments) throws IOException {
+ final FrequencyTrie fast = StemmerPatchTrieLoader.loadBinary(
+ Path.of("stemmers", "english.radixor.gz"),
+ 1024);
+ final FrequencyTrie compact = StemmerPatchTrieLoader.loadBinary(
+ Path.of("stemmers", "english.radixor.gz"),
+ 128);
+
+ System.out.println("fast=" + fast.size() + ", compact=" + compact.size());
+ }
+}
+```
+
+For the trade-off details, see [Lookup Edge Optimization](lookup-edge-optimization.md).
+
### Build or extend a stemmer from dictionary data
Radixor can also build a compiled trie from a custom dictionary. Dictionary lines consist of a canonical stem followed by zero or more variants. The input may be plain UTF-8 text or GZip-compressed UTF-8 text when loaded from a filesystem path. The parser applies `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), ignores leading and trailing whitespace around columns, supports line remarks introduced by `#` or `//`, and skips dictionary items that contain embedded whitespace.
diff --git a/docs/reports.md b/docs/reports.md
index b5b2930..bb0a467 100644
--- a/docs/reports.md
+++ b/docs/reports.md
@@ -23,7 +23,7 @@ These reports are primarily useful when reviewing the published API surface and
These reports describe the outcome of core verification and static-analysis stages for the latest published build:
-- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
+- [Release verification test report (ciRelease)](https://leogalambos.github.io/Radixor/builds/latest/test/)
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
diff --git a/docs/test-taxonomy-and-filtering.md b/docs/test-taxonomy-and-filtering.md
new file mode 100644
index 0000000..8668de4
--- /dev/null
+++ b/docs/test-taxonomy-and-filtering.md
@@ -0,0 +1,216 @@
+# Test Tag Taxonomy and Execution Guide
+
+Radixor uses JUnit tags as an explicit execution policy for its test suite.
+
+The project uses three orthogonal axes:
+
+1. **Scope** (how the test is executed in the pipeline)
+2. **Domain** (where in the system it belongs)
+3. **Intent** (what behavior it verifies)
+
+## Canonical scope tags
+
+| Tag | Description | Typical usage |
+| --- | --- | --- |
+| `unit` | Fast, deterministic tests that exercise a specific class or behavior without external processes. | Default developer feedback; should stay near-zero flakiness and low run time. |
+| `integration` | Tests that span multiple components or end-to-end flows of the public pipeline. | Parser/loader/CLI/IO integration checks and multi-step compile-then-load validations. |
+| `property` | Property-based tests with generator-driven coverage for invariants. | Semantics-preserving laws and edge-case exploration beyond curated fixtures. |
+| `fuzz` | Randomized stress checks with bounded runtime. | Heavier probabilistic verification of robustness and reduction invariants. |
+| `compat` | Backward/forward compatibility and reproducibility checks for persisted artifacts. | Artifact fingerprints, deterministic rebuild, and regression fixtures. |
+| `slow` | Long-running or expensive tests that should not execute in every fast gate. | Heavy fuzz/property budgets or high-duration integration checks. |
+
+## Canonical domain tags
+
+| Tag | Description | Typical usage |
+| --- | --- | --- |
+| `core` | Core algorithm and foundational platform behavior. | Traversal direction, base data structures, low-level helpers. |
+| `trie` | All mutable/compiled trie behaviors and traversal internals. | Lookup path selection, node shape, child representation, subtree behavior. |
+| `frequency-trie` | Algorithms and corner cases specific to frequency-aware trie logic. | Ranking, weighted reductions, persistence of weighted nodes. |
+| `stemmer` | End-user stemming pipeline semantics. | Parse-encode-apply flows and output invariants. |
+| `patch` | Patch encoding, decoding, and application semantics. | `PatchCommandEncoder` behavior and related compatibility contracts. |
+| `io` | Input/output and resource loading boundaries. | Filesystem readers, streams, and stream lifecycle handling. |
+| `serialization` | Binary persistence contract of compiled artifacts. | Versioned format reads/writes and checksum/consistency checks. |
+| `parser` | Dictionary and metadata parsing concerns. | Dictionary input parsing and malformed-source rejection. |
+| `cli` | Command-line entrypoint and command orchestration behavior. | Compile CLI integration and CLI argument validation. |
+| `metadata` | Trie metadata semantics, compatibility fields, and schema expectations. | Version flags, structural properties, and metadata round-trips. |
+| `compile` | Compile-time pipeline and build-oriented behavior. | Building, reduction-mode behavior, and compiled artifact generation. |
+| `diacritic` | Unicode diacritic normalization and stripping behavior. | Accent-removal correctness and locale-safe normalization checks. |
+
+## Canonical intent tags
+
+| Tag | Description | Typical usage |
+| --- | --- | --- |
+| `construction` | Tests around construction and assembly of runtime structures. | Builders, loaders, and compile-time object construction contracts. |
+| `lookup` | Read behavior and retrieval semantics. | `get()`, `getAll()`, traversal and missing-key behavior. |
+| `persistence` | Storage lifecycle semantics. | Serialization/deserialization and round-trip correctness. |
+| `reduction` | Reduction algorithm correctness and corner cases. | Dominance threshold, subtree deduplication, rank-preservation invariants. |
+| `encoding` | Encoding transformation direction. | `PatchCommandEncoder.encode` and serialized command form generation. |
+| `decoding` | Decoding/interpretation of persisted or runtime commands. | Optional consumers that parse and apply encoded command payloads. |
+| `apply` | Patch application and transformation behavior. | Verifies that applied patches produce expected derived forms. |
+| `normalization` | Canonicalization and cleanup behavior. | String normalization around case/shape and mirrored input paths. |
+| `validation` | Input rejection and defensive checks. | Null/empty/invalid contracts and explicit failure conditions. |
+| `regression` | Guard tests for behavior changes over time. | Known historical bugs and behavioral drift prevention. |
+| `determinism` | Repeatable results under fixed input and settings. | Compile determinism, stable ordering, and artifact reproducibility. |
+| `error-handling` | Exception surface and robustness expectations. | Recovery/failure modes and diagnostics quality. |
+
+## Class-level rules
+
+1. Every test class has **exactly one** scope tag.
+2. Every test class has at least one domain tag.
+3. Additional tags describe intent and may be used on classes or nested tests.
+4. For each test class, intent tags should reflect the primary behavior under test, not historical naming conventions.
+
+## Governance and execution policy
+
+The following rules are used to keep the suite auditable and stable:
+
+| Rule | Required state | Why |
+| --- | --- | --- |
+| Scope discipline | Exactly one scope tag per class. | Prevents accidental promotion of integration-only behavior into fast unit runs. |
+| Coverage breadth | At least one domain tag per class. | Ensures tests can be grouped by subsystem for targeted review. |
+| Intent specificity | Use at least one intent tag when behavior is non-trivial. | Makes failure triage faster and profile composition explicit. |
+| Runtime policy | Never run `slow` tests in the default `unit` profile unless explicitly required. | Keeps PR feedback fast while deep checks still run in their dedicated profiles. |
+| Change risk | Any persistence or compatibility-affecting change must include `compat` in validation. | Protects long-lived binary artifact contracts. |
+| Mutation resistance | `fuzz`/`property` sets should be gated to dedicated profiles. | Limits flakiness exposure and controls CI resource cost. |
+
+## Suggested CI profiles
+
+These are recommended launch profiles for local and CI usage and are also exposed as Gradle tasks:
+
+- **Profile: `ci-smoke` (fast feedback):**
+
+```
+./gradlew test -DincludeTags=unit -DexcludeTags=slow
+./gradlew ciSmoke
+```
+
+`ciSmoke` also excludes `org.egothor.stemmer.CompileIntegrationTest*` at test-name filter level as a
+defensive fallback in case of future tag drift.
+`ciRelease` also excludes
+`org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*` at filter level.
+
+- **Profile: `ci-core` (core behavioral coverage):**
+
+```
+./gradlew test -DincludeTags=unit,trie,frequency-trie,property
+./gradlew ciCore
+```
+
+- **Profile: `ci-integration` (pipeline correctness):**
+
+```
+./gradlew test -DincludeTags=integration
+./gradlew ciIntegration
+```
+
+- **Profile: `ci-slow` (explicit heavy validation):**
+
+```
+./gradlew ciSlow
+```
+
+- **Profile: `ci-compat` (artifact stability):**
+
+```
+./gradlew test -DincludeTags=compat,regression
+./gradlew ciCompat
+```
+
+- **Profile: `ci-release` (strong confidence before release):**
+
+```
+./gradlew test -DexcludeTags=slow
+./gradlew ciRelease
+```
+`ciRelease` is non-slow by policy and uses the same defensive name-based exclusion for
+`org.egothor.stemmer.CompileIntegrationTest*` and
+`org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*` in addition to tag filtering.
+
+- **Profile: `ci-nightly` (extended hardening):**
+
+```
+./gradlew test -DincludeTags=fuzz
+./gradlew ciNightly
+```
+
+- **Profile: `ci` (enterprise umbrella):**
+
+```
+./gradlew ci
+```
+
+`ci` and `ciRelease` intentionally do **not** include `slow` paths. Run `ciSlow` explicitly for production-dictionary stress and long-running corpus checks.
+
+## Practical examples
+
+All examples use Gradle with JUnit Platform integration:
+
+- Only unit tests:
+
+```
+./gradlew test -DincludeTags=unit
+```
+
+- Integration tests only:
+
+```
+./gradlew test -DincludeTags=integration
+```
+
+- Only trie subsystem tests:
+
+```
+./gradlew test -DincludeTags=trie
+```
+
+- Deterministic fuzz checks:
+
+```
+./gradlew test -DincludeTags=fuzz
+```
+
+- Property tests:
+
+```
+./gradlew test -DincludeTags=property
+```
+
+- Stemmer + patch command behavior:
+
+```
+./gradlew test -DincludeTags=stemmer,patch
+```
+
+- Compatibility artifacts and regression checks:
+
+```
+./gradlew test -DincludeTags=compat
+```
+
+- Keep regression suite and remove long-running cases:
+
+```
+./gradlew test -DincludeTags=regression -DexcludeTags=slow
+```
+
+- Trie + patch behavior:
+
+```
+./gradlew test -DincludeTags=trie,patch
+```
+
+- Deterministic compatibility and persistence checks:
+
+```
+./gradlew test -DincludeTags=compat,determinism,serialization
+```
+
+## Notes
+
+- `-DincludeTags` and `-DexcludeTags` are interpreted by Gradle task filtering and forwarded into
+ JUnit tag filtering.
+- Class-name filtering is also available via Gradle test selectors where needed
+ (for example, `--tests *CompileTest`), but tag filtering remains the default
+ execution strategy.
+- `-DincludeTags` supports comma-separated literal tags. When you need a single exact tag with special
+ characters, quote the argument for the shell.
diff --git a/gradle/maven-pom.gradle b/gradle/maven-pom.gradle
index 8b7a600..c7a8801 100644
--- a/gradle/maven-pom.gradle
+++ b/gradle/maven-pom.gradle
@@ -84,7 +84,7 @@ publishing {
}
signing {
- required { !version.toString().endsWith('-SNAPSHOT') }
+ required = !version.toString().endsWith('-SNAPSHOT')
if (signingKey != null && !signingKey.isBlank()) {
useInMemoryPgpKeys(signingKey, signingPassword)
sign publishing.publications.mavenJava
diff --git a/mkdocs.yml b/mkdocs.yml
index 6406df4..a69c699 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -54,6 +54,7 @@ nav:
- Overview: architecture-and-reduction.md
- Architecture: architecture.md
- Reduction Semantics: reduction-semantics.md
+ - Lookup Edge Optimization: lookup-edge-optimization.md
- Compatibility and Guarantees: compatibility-and-guarantees.md
- Dictionaries:
@@ -63,3 +64,4 @@ nav:
- Quality and Operations: quality-and-operations.md
- Benchmarking: benchmarking.md
- Reports: reports.md
+ - Test taxonomy and execution filtering: test-taxonomy-and-filtering.md
diff --git a/src/main/java/org/egothor/stemmer/FrequencyTrie.java b/src/main/java/org/egothor/stemmer/FrequencyTrie.java
index 4f2330e..c3a67b9 100644
--- a/src/main/java/org/egothor/stemmer/FrequencyTrie.java
+++ b/src/main/java/org/egothor/stemmer/FrequencyTrie.java
@@ -51,7 +51,6 @@ import java.util.logging.Logger;
import org.egothor.stemmer.trie.CompiledNode;
import org.egothor.stemmer.trie.LocalValueSummary;
import org.egothor.stemmer.trie.MutableNode;
-import org.egothor.stemmer.trie.NodeData;
import org.egothor.stemmer.trie.ReducedNode;
import org.egothor.stemmer.trie.ReductionContext;
import org.egothor.stemmer.trie.ReductionSignature;
@@ -87,7 +86,6 @@ import org.egothor.stemmer.trie.ReductionSignature;
*
* @param value type
*/
-@SuppressWarnings("PMD.CyclomaticComplexity")
public final class FrequencyTrie {
/**
@@ -130,11 +128,54 @@ public final class FrequencyTrie {
*/
private static final int STREAM_MAGIC = 0x45475452;
+ /**
+ * Minimum supported stream version constant retained for explicit range checks.
+ */
+ private static final int MIN_STREAM_VERSION = 1;
+
+ /**
+ * Number of stored values for which {@link #getEntries(String)} can return an
+ * empty result.
+ */
+ private static final int NO_VALUE_COUNT = 0;
+
+ /**
+ * Number of stored values for which {@link #getEntries(String)} can use a
+ * one-item immutable list special case.
+ */
+ private static final int SINGLE_VALUE_COUNT = 1;
+
/**
* Binary format version.
*/
private static final int STREAM_VERSION = 5;
+ /**
+ * Version where traversal-direction ordinal is persisted.
+ */
+ private static final int TRAVERSAL_VERSION = 2;
+
+ /**
+ * Version where compact reduction metadata is persisted.
+ */
+ private static final int REDUCTION_VERSION = 3;
+
+ /**
+ * Version where case-processing mode ordinal is persisted.
+ */
+ private static final int CASE_VERSION = 4;
+
+ /**
+ * Default dense child lookup span in UTF-16 code units ({@code char} values) used when materializing
+ * compiled nodes without an explicit override.
+ *
+ * Increasing this value increases the chance of direct array indexing for
+ * child lookup at runtime at the cost of per-node dense table memory for
+ * compact character spans.
+ *
+ */
+ public static final int DEFAULT_MAX_EXPANDED_INDEX = 512;
+
/**
* Returns the current persisted binary stream format version.
*
@@ -259,7 +300,6 @@ public final class FrequencyTrie {
* if the key does not exist or no value is stored at the addressed node
* @throws NullPointerException if {@code key} is {@code null}
*/
- @SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
public List> getEntries(final String key) {
Objects.requireNonNull(key, "key");
final CompiledNode node = findNode(normalizeLookupKey(key));
@@ -269,11 +309,11 @@ public final class FrequencyTrie {
final V[] orderedValues = node.orderedValues();
final int valueCount = orderedValues.length;
- if (valueCount == 0) {
+ if (valueCount == NO_VALUE_COUNT) {
return List.of();
}
- if (valueCount == 1) {
+ if (valueCount == SINGLE_VALUE_COUNT) {
return List.of(new ValueCount<>(orderedValues[0], node.orderedCounts()[0]));
}
@@ -383,47 +423,31 @@ public final class FrequencyTrie {
*/
public static FrequencyTrie readFrom(final InputStream inputStream, final IntFunction arrayFactory,
final ValueStreamCodec valueCodec) throws IOException {
- Objects.requireNonNull(inputStream, "inputStream");
- Objects.requireNonNull(arrayFactory, "arrayFactory");
- Objects.requireNonNull(valueCodec, "valueCodec");
+ return readFrom(inputStream, arrayFactory, valueCodec, -1);
+ }
- final DataInputStream dataInput; // NOPMD
- if (inputStream instanceof DataInputStream) {
- dataInput = (DataInputStream) inputStream;
- } else {
- dataInput = new DataInputStream(inputStream);
- }
-
- final int magic = dataInput.readInt();
- if (magic != STREAM_MAGIC) {
- throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic));
- }
-
- final int version = dataInput.readInt();
- if (version != 1 && version != 3 && version != 4 && version != STREAM_VERSION) {
- throw new IOException("Unsupported trie stream version: " + version);
- }
-
- final int nodeCount = dataInput.readInt();
- if (nodeCount < 0) {
- throw new IOException("Negative node count: " + nodeCount);
- }
-
- final int rootNodeId = dataInput.readInt();
- if (rootNodeId < 0 || rootNodeId >= nodeCount) {
- throw new IOException("Invalid root node id: " + rootNodeId);
- }
-
- final TrieMetadata metadata = readMetadata(dataInput, version);
-
- final CompiledNode[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount);
- final CompiledNode rootNode = nodes[rootNodeId];
-
- if (LOGGER.isLoggable(Level.FINE)) {
- LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
- }
-
- return new FrequencyTrie<>(arrayFactory, rootNode, metadata);
+ /**
+ * Reads a compiled trie from the supplied input stream, optionally overriding
+ * dense child-index span configuration.
+ *
+ * This setting is applied only while materializing the in-memory compiled
+ * representation during load. It is not serialized in {@link TrieMetadata},
+ * so each load can independently choose its own runtime lookup trade-off.
+ *
+ *
+ * @param inputStream source input stream
+ * @param arrayFactory array factory used to create typed arrays
+ * @param valueCodec codec used to read values
+ * @param maxExpandedIndex dense lookup span override; zero disables dense lookup,
+ * {@code -1} selects {@link #DEFAULT_MAX_EXPANDED_INDEX}; values below {@code -1} are rejected with {@link IllegalArgumentException}
+ * @param value type
+ * @return deserialized compiled trie
+ * @throws NullPointerException if any argument is {@code null}
+ * @throws IOException if reading fails or the binary format is invalid
+ */
+ public static FrequencyTrie readFrom(final InputStream inputStream, final IntFunction arrayFactory,
+ final ValueStreamCodec valueCodec, final int maxExpandedIndex) throws IOException {
+ return CompiledTrieReader.read(inputStream, arrayFactory, valueCodec, maxExpandedIndex);
}
/**
@@ -438,73 +462,6 @@ public final class FrequencyTrie {
dataOutput.writeUTF(metadata.toTextBlock());
}
- /**
- * Reads persisted trie metadata while remaining backward compatible with
- * earlier stream versions.
- *
- * @param dataInput input stream
- * @param version persisted stream version
- * @return deserialized metadata
- * @throws IOException if the metadata section is invalid
- */
- private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException {
- if (version >= 5) { // NOPMD
- try {
- return TrieMetadata.fromTextBlock(version, dataInput.readUTF());
- } catch (IllegalArgumentException exception) {
- throw new IOException("Invalid metadata block.", exception);
- }
- }
-
- final WordTraversalDirection traversalDirection;
- if (version >= 2) { // NOPMD
- final int traversalDirectionOrdinal = dataInput.readInt();
- final WordTraversalDirection[] traversalDirections = WordTraversalDirection.values();
- if (traversalDirectionOrdinal < 0 || traversalDirectionOrdinal >= traversalDirections.length) {
- throw new IOException("Invalid traversal direction ordinal: " + traversalDirectionOrdinal);
- }
- traversalDirection = traversalDirections[traversalDirectionOrdinal];
- } else {
- traversalDirection = WordTraversalDirection.BACKWARD;
- }
-
- if (version < 3) { // NOPMD
- return TrieMetadata.legacy(version, traversalDirection);
- }
-
- final ReductionMode[] reductionModes = ReductionMode.values();
- final int reductionModeOrdinal = dataInput.readInt();
- if (reductionModeOrdinal < 0 || reductionModeOrdinal >= reductionModes.length) {
- throw new IOException("Invalid reduction mode ordinal: " + reductionModeOrdinal);
- }
-
- final int dominantWinnerMinPercent = dataInput.readInt();
- final int dominantWinnerOverSecondRatio = dataInput.readInt(); // NOPMD
-
- final DiacriticProcessingMode[] diacriticProcessingModes = DiacriticProcessingMode.values();
- final int diacriticProcessingModeOrdinal = dataInput.readInt(); // NOPMD
- if (diacriticProcessingModeOrdinal < 0 || diacriticProcessingModeOrdinal >= diacriticProcessingModes.length) {
- throw new IOException("Invalid diacritic processing mode ordinal: " + diacriticProcessingModeOrdinal);
- }
-
- final CaseProcessingMode caseProcessingMode;
- if (version >= 4) { // NOPMD
- final CaseProcessingMode[] caseProcessingModes = CaseProcessingMode.values();
- final int caseProcessingModeOrdinal = dataInput.readInt();
- if (caseProcessingModeOrdinal < 0 || caseProcessingModeOrdinal >= caseProcessingModes.length) {
- throw new IOException("Invalid case processing mode ordinal: " + caseProcessingModeOrdinal);
- }
- caseProcessingMode = caseProcessingModes[caseProcessingModeOrdinal];
- } else {
- caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
- }
-
- return new TrieMetadata(version, traversalDirection,
- new ReductionSettings(reductionModes[reductionModeOrdinal], dominantWinnerMinPercent,
- dominantWinnerOverSecondRatio),
- diacriticProcessingModes[diacriticProcessingModeOrdinal], caseProcessingMode);
- }
-
/**
* Returns the number of canonical compiled nodes reachable from the root.
*
@@ -574,103 +531,218 @@ public final class FrequencyTrie {
}
/**
- * Reads all compiled nodes and resolves child references.
- *
- * @param dataInput input
- * @param arrayFactory array factory
- * @param valueCodec value codec
- * @param nodeCount number of nodes
- * @param value type
- * @return array of nodes indexed by serialized node identifier
- * @throws IOException if reading fails or the stream is invalid
- */
- @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops")
- private static CompiledNode[] readNodes(final DataInputStream dataInput, final IntFunction arrayFactory,
- final ValueStreamCodec valueCodec, final int nodeCount) throws IOException {
- final List> nodeDataList = new ArrayList<>(nodeCount);
-
- for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
- final int edgeCount = dataInput.readInt();
- if (edgeCount < 0) {
- throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount);
- }
-
- final char[] edgeLabels = new char[edgeCount];
- final int[] childNodeIds = new int[edgeCount];
-
- for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
- edgeLabels[edgeIndex] = dataInput.readChar();
- childNodeIds[edgeIndex] = dataInput.readInt();
- }
-
- validateSerializedEdges(nodeIndex, edgeLabels);
-
- final int valueCount = dataInput.readInt();
- if (valueCount < 0) {
- throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
- }
-
- final V[] orderedValues = arrayFactory.apply(valueCount);
- final int[] orderedCounts = new int[valueCount];
-
- for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) {
- orderedValues[valueIndex] = valueCodec.read(dataInput);
- orderedCounts[valueIndex] = dataInput.readInt();
- if (orderedCounts[valueIndex] <= 0) {
- throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index "
- + valueIndex + ": " + orderedCounts[valueIndex]);
- }
- }
-
- nodeDataList.add(new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts));
- }
-
- @SuppressWarnings("unchecked")
- final CompiledNode[] nodes = new CompiledNode[nodeCount];
-
- for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
- final NodeData nodeData = nodeDataList.get(nodeIndex);
- @SuppressWarnings("unchecked")
- final CompiledNode[] children = new CompiledNode[nodeData.childNodeIds().length];
- nodes[nodeIndex] = new CompiledNode<>(nodeData.edgeLabels(), children, nodeData.orderedValues(),
- nodeData.orderedCounts());
- }
-
- for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
- final NodeData nodeData = nodeDataList.get(nodeIndex);
- final CompiledNode node = nodes[nodeIndex];
-
- for (int edgeIndex = 0; edgeIndex < nodeData.childNodeIds().length; edgeIndex++) {
- final int childNodeId = nodeData.childNodeIds()[edgeIndex];
- if (childNodeId < 0 || childNodeId >= nodeCount) {
- throw new IOException("Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex
- + ": " + childNodeId);
- }
- node.children()[edgeIndex] = nodes[childNodeId];
- }
- }
-
- return nodes;
- }
-
- /**
- * Validates the serialized edge-label sequence for one node.
+ * Internal helper that materializes serialized trie data.
*
*
- * Compiled nodes rely on binary search for child lookup and therefore require
- * edge labels to be stored in strict ascending order without duplicates.
- * Rejecting malformed streams here keeps lookup semantics deterministic and
- * avoids silently constructing a trie whose search behavior would be undefined.
- *
- * @param nodeIndex serialized node identifier
- * @param edgeLabels serialized edge labels
- * @throws IOException if the edge labels are not strictly ascending
+ * Moving reader complexity into this helper keeps the public-facing class from
+ * accumulating excessive class-level cyclomatic complexity while preserving the
+ * same binary compatibility contract.
+ *
*/
- private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException {
- for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) {
- if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) {
- throw new IOException("Edge labels must be strictly ascending at node " + nodeIndex + ", edge index "
- + edgeIndex + ": '" + edgeLabels[edgeIndex - 1] + "' then '" + edgeLabels[edgeIndex] + "'.");
+ private static final class CompiledTrieReader {
+
+ private static FrequencyTrie read(final InputStream inputStream, final IntFunction arrayFactory,
+ final ValueStreamCodec valueCodec, final int maxExpandedIndex) throws IOException {
+ Objects.requireNonNull(inputStream, "inputStream");
+ Objects.requireNonNull(arrayFactory, "arrayFactory");
+ Objects.requireNonNull(valueCodec, "valueCodec");
+ if (maxExpandedIndex < -1) {
+ throw new IllegalArgumentException("maxExpandedIndex must be >= -1.");
+ }
+
+ final DataInputStream dataInput = wrapInputStream(inputStream);
+ final int magic = dataInput.readInt();
+ if (magic != STREAM_MAGIC) {
+ throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic));
+ }
+
+ final int version = dataInput.readInt();
+ if (version < MIN_STREAM_VERSION || version > STREAM_VERSION) {
+ throw new IOException("Unsupported trie stream version: " + version);
+ }
+
+ final int nodeCount = dataInput.readInt();
+ if (nodeCount < 0) {
+ throw new IOException("Negative node count: " + nodeCount);
+ }
+
+ final int rootNodeId = dataInput.readInt();
+ if (rootNodeId < 0 || rootNodeId >= nodeCount) {
+ throw new IOException("Invalid root node id: " + rootNodeId);
+ }
+
+ final TrieMetadata sourceMetadata = readMetadata(dataInput, version);
+ final int effectiveMaxExpandedIndex = maxExpandedIndex >= 0 ? maxExpandedIndex : DEFAULT_MAX_EXPANDED_INDEX;
+ final CompiledNode[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount, effectiveMaxExpandedIndex);
+ final CompiledNode rootNode = nodes[rootNodeId];
+
+ if (LOGGER.isLoggable(Level.FINE)) {
+ LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
+ }
+
+ return new FrequencyTrie<>(arrayFactory, rootNode, sourceMetadata);
+ }
+
+ private static DataInputStream wrapInputStream(final InputStream inputStream) {
+ return inputStream instanceof DataInputStream
+ ? (DataInputStream) inputStream
+ : new DataInputStream(inputStream);
+ }
+
+ private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException {
+ if (version == STREAM_VERSION) {
+ return readTextMetadata(dataInput);
+ }
+
+ final WordTraversalDirection traversalDirection = readTraversalDirection(dataInput, version);
+ if (version < REDUCTION_VERSION) {
+ return TrieMetadata.legacy(version, traversalDirection);
+ }
+
+ final ReductionSettings reductionSettings = readReductionSettings(dataInput);
+ final DiacriticProcessingMode diacriticProcessingMode = readEnumByOrdinal(dataInput, DiacriticProcessingMode.values(),
+ "diacritic processing mode");
+ final CaseProcessingMode caseProcessingMode = version >= CASE_VERSION
+ ? readCaseProcessingMode(dataInput)
+ : CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
+ return new TrieMetadata(version, traversalDirection, reductionSettings, diacriticProcessingMode, caseProcessingMode);
+ }
+
+ private static TrieMetadata readTextMetadata(final DataInputStream dataInput) throws IOException {
+ try {
+ return TrieMetadata.fromTextBlock(STREAM_VERSION, dataInput.readUTF());
+ } catch (IllegalArgumentException exception) {
+ throw new IOException("Invalid metadata block.", exception);
+ }
+ }
+
+ private static WordTraversalDirection readTraversalDirection(final DataInputStream dataInput, final int version)
+ throws IOException {
+ if (version < TRAVERSAL_VERSION) {
+ return WordTraversalDirection.BACKWARD;
+ }
+ return readEnumByOrdinal(dataInput, WordTraversalDirection.values(), "traversal direction");
+ }
+
+ private static ReductionSettings readReductionSettings(final DataInputStream dataInput) throws IOException {
+ final ReductionMode reductionMode = readEnumByOrdinal(dataInput, ReductionMode.values(), "reduction mode");
+ final int dominantWinnerMinPercent = dataInput.readInt();
+ final int dominantWinnerOverSecondRatio = dataInput.readInt(); // NOPMD
+ return new ReductionSettings(reductionMode, dominantWinnerMinPercent, dominantWinnerOverSecondRatio);
+ }
+
+ private static CaseProcessingMode readCaseProcessingMode(final DataInputStream dataInput) throws IOException {
+ return readEnumByOrdinal(dataInput, CaseProcessingMode.values(), "case processing mode");
+ }
+
+ private static > E readEnumByOrdinal(final DataInputStream dataInput, final E[] values,
+ final String name) throws IOException {
+ final int ordinal = dataInput.readInt();
+ if (ordinal < 0 || ordinal >= values.length) {
+ throw new IOException("Invalid " + name + " ordinal: " + ordinal);
+ }
+ return values[ordinal];
+ }
+
+ private static CompiledNode[] readNodes(final DataInputStream dataInput, final IntFunction arrayFactory,
+ final ValueStreamCodec valueCodec, final int nodeCount, final int maxExpandedIndex) throws IOException {
+ final char[][] edgeLabelsByNode = new char[nodeCount][];
+ final int[][] childNodeIdsByNode = new int[nodeCount][];
+ @SuppressWarnings("unchecked")
+ final V[][] orderedValuesByNode = (V[][]) new Object[nodeCount][];
+ final int[][] orderedCountsByNode = new int[nodeCount][];
+
+ for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
+ final int edgeCount = dataInput.readInt();
+ if (edgeCount < 0) {
+ throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount);
+ }
+
+ edgeLabelsByNode[nodeIndex] = new char[edgeCount];
+ childNodeIdsByNode[nodeIndex] = new int[edgeCount];
+
+ for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
+ edgeLabelsByNode[nodeIndex][edgeIndex] = dataInput.readChar();
+ childNodeIdsByNode[nodeIndex][edgeIndex] = dataInput.readInt();
+ }
+
+ validateSerializedEdges(nodeIndex, edgeLabelsByNode[nodeIndex]);
+
+ final int valueCount = dataInput.readInt();
+ if (valueCount < 0) {
+ throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
+ }
+
+ orderedValuesByNode[nodeIndex] = arrayFactory.apply(valueCount);
+ orderedCountsByNode[nodeIndex] = new int[valueCount];
+
+ for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) {
+ orderedValuesByNode[nodeIndex][valueIndex] = valueCodec.read(dataInput);
+ orderedCountsByNode[nodeIndex][valueIndex] = dataInput.readInt();
+ if (orderedCountsByNode[nodeIndex][valueIndex] <= 0) {
+ throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index "
+ + valueIndex + ": " + orderedCountsByNode[nodeIndex][valueIndex]);
+ }
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ final CompiledNode[] nodes = new CompiledNode[nodeCount];
+ final boolean[] inProgress = new boolean[nodeCount];
+
+ for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
+ nodes[nodeIndex] = resolveNode(nodeIndex, edgeLabelsByNode, childNodeIdsByNode, orderedValuesByNode,
+ orderedCountsByNode, nodes, inProgress, maxExpandedIndex);
+ }
+
+ return nodes;
+ }
+
+ private static CompiledNode resolveNode(final int nodeIndex, final char[][] edgeLabelsByNode,
+ final int[][] childNodeIdsByNode, final V[][] orderedValuesByNode, final int[][] orderedCountsByNode,
+ final CompiledNode[] nodes, final boolean[] inProgress, final int maxExpandedIndex) throws IOException {
+ final CompiledNode cachedNode = nodes[nodeIndex];
+ if (cachedNode != null) {
+ return cachedNode;
+ }
+
+ if (inProgress[nodeIndex]) {
+ throw new IOException("Invalid serialized node graph: cyclic reference detected at node " + nodeIndex + '.');
+ }
+ inProgress[nodeIndex] = true;
+ try {
+ final char[] edgeLabels = edgeLabelsByNode[nodeIndex];
+ final int[] childNodeIds = childNodeIdsByNode[nodeIndex];
+ final int edgeCount = childNodeIds.length;
+ @SuppressWarnings("unchecked")
+ final CompiledNode[] children = new CompiledNode[edgeCount];
+
+ for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
+ final int childNodeId = childNodeIds[edgeIndex];
+ if (childNodeId < 0 || childNodeId >= edgeLabelsByNode.length) {
+ throw new IOException(
+ "Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex + ": "
+ + childNodeId);
+ }
+ children[edgeIndex] = resolveNode(childNodeId, edgeLabelsByNode, childNodeIdsByNode,
+ orderedValuesByNode, orderedCountsByNode, nodes, inProgress, maxExpandedIndex);
+ }
+
+ final CompiledNode node = new CompiledNode<>(edgeLabels, children, orderedValuesByNode[nodeIndex], maxExpandedIndex,
+ orderedCountsByNode[nodeIndex]);
+ nodes[nodeIndex] = node;
+ return node;
+ } finally {
+ inProgress[nodeIndex] = false;
+ }
+ }
+
+ private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException {
+ for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) {
+ if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) {
+ throw new IOException("Edge labels must be strictly ascending at node " + nodeIndex + ", edge index "
+ + edgeIndex + ": '" + edgeLabels[edgeIndex - 1] + "' then '" + edgeLabels[edgeIndex] + "'.");
+ }
}
}
}
@@ -771,6 +843,16 @@ public final class FrequencyTrie {
*/
private final DiacriticProcessingMode diacriticProcessingMode;
+ /**
+ * Dense edge lookup span threshold.
+ *
+ * This value controls a speed/memory trade-off during freezing:
+ * dense child lookup tables are allocated only for nodes whose child
+ * labels fit in this span.
+ *
+ */
+ private final int maxExpandedIndex;
+
/**
* Mutable root node.
*/
@@ -837,11 +919,39 @@ public final class FrequencyTrie {
public Builder(final IntFunction arrayFactory, final ReductionSettings reductionSettings,
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
final DiacriticProcessingMode diacriticProcessingMode) {
+ this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, diacriticProcessingMode,
+ CompiledNode.DEFAULT_MAX_EXPANDED_INDEX);
+ }
+
+ /**
+ * Creates a new builder with the provided settings, explicit traversal
+ * direction, explicit case processing mode, explicit diacritic processing
+ * mode, and an explicit dense child lookup threshold.
+ *
+ * @param arrayFactory array factory
+ * @param reductionSettings reduction configuration
+ * @param traversalDirection logical key traversal direction
+ * @param caseProcessingMode dictionary case processing mode
+ * @param diacriticProcessingMode dictionary diacritic processing mode
+ * @param maxExpandedIndex dense lookup span override; zero disables
+ * dense lookup and negative values are rejected
+ * with {@link IllegalArgumentException}. Larger
+ * values allow more direct indexing but may use
+ * more memory in nodes whose label span fits.
+ * @throws NullPointerException if any argument is {@code null}
+ */
+ public Builder(final IntFunction arrayFactory, final ReductionSettings reductionSettings,
+ final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
+ final DiacriticProcessingMode diacriticProcessingMode, final int maxExpandedIndex) {
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
+ if (maxExpandedIndex < 0) {
+ throw new IllegalArgumentException("maxExpandedIndex must be non-negative.");
+ }
+ this.maxExpandedIndex = maxExpandedIndex;
this.root = new MutableNode<>();
}
@@ -1098,7 +1208,7 @@ public final class FrequencyTrie {
}
final CompiledNode frozen = new CompiledNode<>(edges, childNodes, localSummary.orderedValues(),
- localSummary.orderedCounts());
+ this.maxExpandedIndex, localSummary.orderedCounts());
cache.put(reducedNode, frozen);
return frozen;
}
diff --git a/src/main/java/org/egothor/stemmer/StemmerPatchTrieBinaryIO.java b/src/main/java/org/egothor/stemmer/StemmerPatchTrieBinaryIO.java
index 84ae1c9..8f8965a 100644
--- a/src/main/java/org/egothor/stemmer/StemmerPatchTrieBinaryIO.java
+++ b/src/main/java/org/egothor/stemmer/StemmerPatchTrieBinaryIO.java
@@ -94,6 +94,29 @@ public final class StemmerPatchTrieBinaryIO {
}
}
+ /**
+ * Reads a GZip-compressed binary patch-command trie from a filesystem path
+ * with an optional dense child lookup span override.
+ *
+ * This is a runtime-only tuning parameter. The dense-span setting is not
+ * persisted in the file and does not change the compiled metadata.
+ *
+ *
+ * @param path source file
+ * @param maxExpandedIndex dense lookup span override; {@code -1} selects
+ * {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}, values below {@code -1} are rejected
+ * @return deserialized trie
+ * @throws NullPointerException if {@code path} is {@code null}
+ * @throws IOException if reading or decompression fails
+ */
+ public static FrequencyTrie read(final Path path, final int maxExpandedIndex) throws IOException {
+ Objects.requireNonNull(path, "path");
+
+ try (InputStream fileInputStream = Files.newInputStream(path)) {
+ return read(fileInputStream, maxExpandedIndex);
+ }
+ }
+
/**
* Reads a GZip-compressed binary patch-command trie from a filesystem path
* string.
@@ -108,6 +131,26 @@ public final class StemmerPatchTrieBinaryIO {
return read(Path.of(fileName));
}
+ /**
+ * Reads a GZip-compressed binary patch-command trie from a filesystem path
+ * string with an optional dense child lookup span override.
+ *
+ * This is a runtime-only tuning parameter. The dense-span setting is not
+ * persisted in the file and does not change the compiled metadata.
+ *
+ *
+ * @param fileName source file name or path string
+ * @param maxExpandedIndex dense lookup span override; {@code -1} selects
+ * {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}, values below {@code -1} are rejected
+ * @return deserialized trie
+ * @throws NullPointerException if {@code fileName} is {@code null}
+ * @throws IOException if reading or decompression fails
+ */
+ public static FrequencyTrie read(final String fileName, final int maxExpandedIndex) throws IOException {
+ Objects.requireNonNull(fileName, "fileName");
+ return read(Path.of(fileName), maxExpandedIndex);
+ }
+
/**
* Reads a GZip-compressed binary patch-command trie from an input stream.
*
@@ -132,6 +175,34 @@ public final class StemmerPatchTrieBinaryIO {
}
}
+ /**
+ * Reads a GZip-compressed binary patch-command trie from an input stream with
+ * an optional dense child lookup span override.
+ *
+ * This is a runtime-only tuning parameter. The dense-span setting is not
+ * persisted in the file and does not change the compiled metadata.
+ *
+ *
+ * @param inputStream source stream
+ * @param maxExpandedIndex dense lookup span override; {@code -1} selects
+ * {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}, values below {@code -1} are rejected
+ * @return deserialized trie
+ * @throws NullPointerException if {@code inputStream} is {@code null}
+ * @throws IOException if reading or decompression fails
+ */
+ public static FrequencyTrie read(final InputStream inputStream, final int maxExpandedIndex) throws IOException {
+ Objects.requireNonNull(inputStream, "inputStream");
+
+ try (GZIPInputStream gzipInputStream = new GZIPInputStream(new BufferedInputStream(inputStream));
+ DataInputStream dataInputStream = new DataInputStream(gzipInputStream)) {
+ final FrequencyTrie trie = FrequencyTrie.readFrom(dataInputStream, String[]::new, STRING_CODEC,
+ maxExpandedIndex);
+
+ LOGGER.log(Level.FINE, "Read compressed binary stemmer trie.");
+ return trie;
+ }
+ }
+
/**
* Reads only metadata from a GZip-compressed binary patch-command trie stored
* at a filesystem path.
diff --git a/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java b/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java
index f9770fa..9e60b68 100644
--- a/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java
+++ b/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java
@@ -71,6 +71,7 @@ import java.util.zip.GZIPInputStream;
public final class StemmerPatchTrieLoader {
/* default */ static final String FILENAME_REQUIRED = "fileName required";
+ private static final String PARAMETER_PATH = "path";
/**
* Logger of this class.
@@ -460,8 +461,8 @@ public final class StemmerPatchTrieLoader {
public static FrequencyTrie load(final Path path, final boolean storeOriginal,
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
- throws IOException {
- Objects.requireNonNull(path, "path");
+ throws IOException {
+ Objects.requireNonNull(path, PARAMETER_PATH);
final TrieMetadata metadata = metadataForCompilation(traversalDirection, reductionSettings, caseProcessingMode,
diacriticProcessingMode);
return load(path, storeOriginal, metadata);
@@ -487,7 +488,7 @@ public final class StemmerPatchTrieLoader {
*/
public static FrequencyTrie load(final Path path, final boolean storeOriginal, final TrieMetadata metadata)
throws IOException {
- Objects.requireNonNull(path, "path");
+ Objects.requireNonNull(path, PARAMETER_PATH);
Objects.requireNonNull(metadata, "metadata");
try (InputStream inputStream = openDictionaryInputStream(path);
@@ -759,10 +760,31 @@ public final class StemmerPatchTrieLoader {
* read
*/
public static FrequencyTrie loadBinary(final Path path) throws IOException {
- Objects.requireNonNull(path, "path");
+ Objects.requireNonNull(path, PARAMETER_PATH);
return StemmerPatchTrieBinaryIO.read(path);
}
+ /**
+ * Loads a GZip-compressed binary patch-command trie from a filesystem path
+ * using a custom dense lookup span override.
+ *
+ * This is a runtime-only tuning parameter that does not affect persisted
+ * metadata.
+ *
+ *
+ * @param path path to the compressed binary trie file
+ * @param maxExpandedIndex dense lookup span override; {@code -1} selects
+ * {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}, values below {@code -1} are rejected
+ * @return compiled patch-command trie
+ * @throws NullPointerException if {@code path} is {@code null}
+ * @throws IOException if the file cannot be opened, decompressed, or
+ * read
+ */
+ public static FrequencyTrie loadBinary(final Path path, final int maxExpandedIndex) throws IOException {
+ Objects.requireNonNull(path, PARAMETER_PATH);
+ return StemmerPatchTrieBinaryIO.read(path, maxExpandedIndex);
+ }
+
/**
* Loads a GZip-compressed binary patch-command trie from a filesystem path
* string.
@@ -778,6 +800,27 @@ public final class StemmerPatchTrieLoader {
return StemmerPatchTrieBinaryIO.read(fileName);
}
+ /**
+ * Loads a GZip-compressed binary patch-command trie from a filesystem path
+ * string using a custom dense lookup span override.
+ *
+ * This is a runtime-only tuning parameter that does not affect persisted
+ * metadata.
+ *
+ *
+ * @param fileName file name or path string
+ * @param maxExpandedIndex dense lookup span override; {@code -1} selects
+ * {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}, values below {@code -1} are rejected
+ * @return compiled patch-command trie
+ * @throws NullPointerException if {@code fileName} is {@code null}
+ * @throws IOException if the file cannot be opened, decompressed, or
+ * read
+ */
+ public static FrequencyTrie loadBinary(final String fileName, final int maxExpandedIndex) throws IOException {
+ Objects.requireNonNull(fileName, FILENAME_REQUIRED);
+ return StemmerPatchTrieBinaryIO.read(fileName, maxExpandedIndex);
+ }
+
/**
* Loads a GZip-compressed binary patch-command trie from an input stream.
*
@@ -802,7 +845,7 @@ public final class StemmerPatchTrieLoader {
* read
*/
public static TrieMetadata loadBinaryMetadata(final Path path) throws IOException {
- Objects.requireNonNull(path, "path");
+ Objects.requireNonNull(path, PARAMETER_PATH);
return StemmerPatchTrieBinaryIO.readMetadata(path);
}
@@ -845,7 +888,7 @@ public final class StemmerPatchTrieLoader {
*/
public static void saveBinary(final FrequencyTrie trie, final Path path) throws IOException {
Objects.requireNonNull(trie, "trie");
- Objects.requireNonNull(path, "path");
+ Objects.requireNonNull(path, PARAMETER_PATH);
StemmerPatchTrieBinaryIO.write(trie, path);
}
diff --git a/src/main/java/org/egothor/stemmer/trie/CompiledNode.java b/src/main/java/org/egothor/stemmer/trie/CompiledNode.java
index faff909..c48d795 100644
--- a/src/main/java/org/egothor/stemmer/trie/CompiledNode.java
+++ b/src/main/java/org/egothor/stemmer/trie/CompiledNode.java
@@ -1,21 +1,21 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
- *
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
- *
+ *
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
- *
+ *
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
- *
+ *
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
- *
+ *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -43,14 +43,15 @@ import java.util.Objects;
* immutable from the public API perspective because construction wires these
* arrays once and all lookup operations thereafter treat them as read-only.
*
- * @param value type
- * @param edgeLabels internal edge label array
- * @param children internal child array
- * @param orderedValues internal ordered values array
- * @param orderedCounts internal ordered counts array
+ * @param value type
*/
-@SuppressWarnings("PMD.DataClass")
-public record CompiledNode(char[] edgeLabels, CompiledNode[] children, V[] orderedValues, int... orderedCounts) {
+public final class CompiledNode {
+
+ /**
+ * Default dense child lookup span in characters used when an explicit override is
+ * not provided.
+ */
+ public static final int DEFAULT_MAX_EXPANDED_INDEX = 512;
/**
* Number of child edges where linear scan is cheaper than binary search.
@@ -58,24 +59,112 @@ public record CompiledNode(char[] edgeLabels, CompiledNode[] children, V[]
private static final int LINEAR_CHILD_COUNT_THRESHOLD = 4;
/**
- * Creates one validated compiled node.
+ * Edge labels in sorted ascending order.
+ */
+ private final char[] edgeLabels;
+
+ /**
+ * Sparse child array aligned with {@link #edgeLabels}.
+ */
+ private final CompiledNode[] children;
+
+ /**
+ * Dense child lookup table used when labels fit into a compact char interval.
+ *
+ * The table enables direct O(1) indexing for child lookup and is allocated
+ * only when the character span of this node's edges is within the configured
+ * threshold.
+ *
+ */
+ private final CompiledNode[] denseChildren;
+
+ /**
+ * Normalized minimum edge value for the dense lookup table.
+ */
+ private final int denseEdgeMin;
+
+ /**
+ * Values stored at this node in local order.
+ */
+ private final V[] orderedValues;
+
+ /**
+ * Occurrence counts aligned with {@link #orderedValues}.
+ */
+ private final int[] orderedCounts;
+
+ /**
+ * Creates one validated compiled node using {@link #DEFAULT_MAX_EXPANDED_INDEX}
+ * for dense lookup sizing.
*
* @throws NullPointerException if any array argument is {@code null}
* @throws IllegalArgumentException if the edge-related arrays or value-related
* arrays do not have matching lengths
*/
- public CompiledNode {
+ public CompiledNode(final char[] edgeLabels, final CompiledNode[] children, final V[] orderedValues,
+ final int... orderedCounts) {
+ this(edgeLabels, children, orderedValues, DEFAULT_MAX_EXPANDED_INDEX, orderedCounts);
+ }
+
+ /**
+ * Creates one validated compiled node.
+ *
+ * @param maxExpandedIndex largest allowed label span (highest minus lowest
+ * edge label) for the dense table, which then holds
+ * span + 1 slots; zero disables dense lookup. Larger
+ * values trade dense table memory for direct indexing.
+ * @throws NullPointerException if any array argument is {@code null}
+ * @throws IllegalArgumentException if the edge-related arrays or value-related
+ * arrays do not have matching lengths or the
+ * dense interval size is negative
+ */
+ public CompiledNode(final char[] edgeLabels, final CompiledNode[] children, final V[] orderedValues,
+ final int maxExpandedIndex, final int... orderedCounts) {
Objects.requireNonNull(edgeLabels, "edgeLabels");
Objects.requireNonNull(children, "children");
Objects.requireNonNull(orderedValues, "orderedValues");
Objects.requireNonNull(orderedCounts, "orderedCounts");
+ if (maxExpandedIndex < 0) {
+ throw new IllegalArgumentException("maxExpandedIndex must be non-negative.");
+ }
+
if (edgeLabels.length != children.length) {
throw new IllegalArgumentException("edgeLabels and children must have the same length.");
}
if (orderedValues.length != orderedCounts.length) {
throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
}
+
+ this.edgeLabels = edgeLabels;
+ this.children = children;
+ this.orderedValues = orderedValues;
+ this.orderedCounts = orderedCounts;
+
+ if (edgeLabels.length == 0 || maxExpandedIndex == 0) {
+ this.denseChildren = null;
+ this.denseEdgeMin = 0;
+ return;
+ }
+
+ final int minEdge = edgeLabels[0];
+ final int maxEdge = edgeLabels[edgeLabels.length - 1];
+ final int span = maxEdge - minEdge;
+
+ if (span < 0 || span > maxExpandedIndex) {
+ this.denseChildren = null;
+ this.denseEdgeMin = 0;
+ return;
+ }
+
+ @SuppressWarnings("unchecked")
+ final CompiledNode[] dense = (CompiledNode[]) new CompiledNode[span + 1];
+ for (int edgeIndex = 0; edgeIndex < edgeLabels.length; edgeIndex++) {
+ dense[edgeLabels[edgeIndex] - minEdge] = children[edgeIndex];
+ }
+
+ this.denseChildren = dense;
+ this.denseEdgeMin = minEdge;
}
/**
@@ -87,7 +176,6 @@ public record CompiledNode(char[] edgeLabels, CompiledNode[] children, V[]
*
* @return internal edge-label array
*/
- @Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public char[] edgeLabels() {
return this.edgeLabels;
@@ -102,7 +190,6 @@ public record CompiledNode(char[] edgeLabels, CompiledNode[] children, V[]
*
* @return internal child-node array
*/
- @Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public CompiledNode[] children() {
return this.children;
@@ -117,7 +204,6 @@ public record CompiledNode(char[] edgeLabels, CompiledNode[] children, V[]
*
* @return internal ordered-values array
*/
- @Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public V[] orderedValues() {
return this.orderedValues;
@@ -132,14 +218,143 @@ public record CompiledNode(char[] edgeLabels, CompiledNode[] children, V[]
*
* @return internal ordered-counts array
*/
- @Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public int[] orderedCounts() {
return this.orderedCounts;
}
+ /**
+ * Returns the number of child edges represented by this node.
+ *
+ * @return child edge count
+ */
+ public int edgeCount() {
+ return this.edgeLabels.length;
+ }
+
+ /**
+ * Returns the number of values stored in this node.
+ *
+ * @return value count
+ */
+ public int valueCount() {
+ return this.orderedValues.length;
+ }
+
+ /**
+ * Indicates whether this node stores any values.
+ *
+ * @return {@code true} when values are present at this node
+ */
+ public boolean hasValues() {
+ return this.orderedValues.length > 0;
+ }
+
+ /**
+ * Indicates whether this node has child edges.
+ *
+ * @return {@code true} when this node has at least one outgoing edge
+ */
+ public boolean hasChildren() {
+ return this.edgeLabels.length > 0;
+ }
+
+ /**
+ * Indicates whether this node has no child edges.
+ *
+ * @return {@code true} when this node is a terminal leaf node
+ */
+ public boolean isLeaf() {
+ return !hasChildren();
+ }
+
+ /**
+ * Tests whether an edge label is present at this node.
+ *
+ * @param edge edge label
+ * @return {@code true} if this node contains the supplied edge label
+ */
+ public boolean hasEdge(final char edge) {
+ return findChild(edge) != null;
+ }
+
+ /**
+ * Indicates whether this node has a dense direct-index child lookup table.
+ *
+ * @return {@code true} when a direct-index child table is available
+ */
+ public boolean hasDenseLookup() {
+ return this.denseChildren != null;
+ }
+
+ /**
+ * Returns a small memory-related metric describing this node's dense table size.
+ *
+ * @return number of dense table slots, or {@code 0} when dense lookup is not
+ * enabled
+ */
+ public int denseTableLength() {
+ return this.denseChildren == null ? 0 : this.denseChildren.length;
+ }
+
+ /**
+ * Returns a structural hash of this node. The dense table is summarized by its length
+ * only: its slots alias {@link #children}, so hashing it would revisit every subtree twice.
+ * @return summary hash for node structure and contents
+ */
+ @Override
+ public int hashCode() {
+ int hash = Arrays.hashCode(this.edgeLabels);
+ hash = 31 * hash + Arrays.hashCode(this.children);
+ hash = 31 * hash + Arrays.hashCode(this.orderedValues);
+ hash = 31 * hash + Arrays.hashCode(this.orderedCounts);
+ hash = 31 * hash + Objects.hash(this.denseEdgeMin);
+ hash = 31 * hash + denseTableLength();
+ return hash;
+ }
+
+ /**
+ * Compares structural node content, including dense table availability. The dense table is
+ * compared by length only; its slots are fully determined by the labels and children.
+ * @param object comparison object
+ * @return {@code true} when nodes describe identical structure and payload
+ */
+ @Override
+ public boolean equals(final Object object) {
+ if (this == object) {
+ return true;
+ }
+ if (!(object instanceof CompiledNode<?> other)) {
+ return false;
+ }
+ return Arrays.equals(this.edgeLabels, other.edgeLabels) && Arrays.equals(this.children, other.children)
+ && Arrays.equals(this.orderedValues, other.orderedValues) && Arrays.equals(this.orderedCounts, other.orderedCounts)
+ && this.denseEdgeMin == other.denseEdgeMin && denseTableLength() == other.denseTableLength();
+ }
+
+ /**
+ * Returns a short summary useful for debugging and diagnostics.
+ *
+ * @return textual node summary
+ */
+ @Override
+ public String toString() {
+ return "CompiledNode{"
+ + "edgeCount=" + this.edgeLabels.length + ", orderedValueCount=" + this.orderedValues.length
+ + ", denseTableLength=" + denseTableLength() + '}';
+ }
+
/**
* Finds a child for the supplied edge character.
+ *
+ * Lookup order is:
+ *
+ * - dense array index (if the label interval is compact enough),
+ * - small-child linear scan when the fallback node has {@value #LINEAR_CHILD_COUNT_THRESHOLD}
+ * or fewer edges,
+ * - binary search over sorted labels.
+ *
+ *
*
* @param edge edge character
* @return child node, or {@code null} if absent
@@ -149,6 +364,15 @@ public record CompiledNode(char[] edgeLabels, CompiledNode[] children, V[]
if (childCount == 0) {
return null;
}
+
+ if (this.denseChildren != null) {
+ final int denseIndex = edge - this.denseEdgeMin;
+ if (denseIndex < 0 || denseIndex >= this.denseChildren.length) {
+ return null;
+ }
+ return this.denseChildren[denseIndex];
+ }
+
if (childCount <= LINEAR_CHILD_COUNT_THRESHOLD) {
for (int index = 0; index < childCount; index++) {
if (this.edgeLabels[index] == edge) {
diff --git a/src/test/java/org/egothor/stemmer/CompileIntegrationTest.java b/src/test/java/org/egothor/stemmer/CompileIntegrationTest.java
index 0556477..48bd5ba 100644
--- a/src/test/java/org/egothor/stemmer/CompileIntegrationTest.java
+++ b/src/test/java/org/egothor/stemmer/CompileIntegrationTest.java
@@ -95,6 +95,8 @@ import org.junit.jupiter.params.provider.MethodSource;
@Tag("integration")
@Tag("cli")
@Tag("stemmer")
+@Tag("compile")
+@Tag("slow")
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@DisplayName("Compile integration")
final class CompileIntegrationTest {
@@ -189,9 +191,10 @@ final class CompileIntegrationTest {
* create nested output directories, preserve expected lookup behavior, and
* store canonical stems when {@code --store-original} is enabled.
*
- * @throws IOException if reading or writing fails
+ * @throws IOException if reading or writing fails
*/
@Test
+ @Tag("slow")
@DisplayName("CLI should compile the remark-aware fixture and preserve expected lookups")
void shouldCompileRemarkAwareFixtureAndPreserveExpectedLookups() throws IOException {
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
@@ -234,9 +237,10 @@ final class CompileIntegrationTest {
* Verifies that the CLI rejects an already existing output path unless
* overwrite is explicitly enabled.
*
- * @throws IOException if reading or writing fails
+ * @throws IOException if reading or writing fails
*/
@Test
+ @Tag("slow")
@DisplayName("CLI should require overwrite before replacing an existing output artifact")
void shouldRequireOverwriteForExistingOutput() throws IOException {
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
@@ -301,6 +305,7 @@ final class CompileIntegrationTest {
@Nested
@DisplayName("Bundled project dictionary workflows")
+ @Tag("slow")
final class BundledProjectDictionaryWorkflows {
/**
@@ -317,11 +322,12 @@ final class CompileIntegrationTest {
*
*
* @param scenario scenario identifier
- * @param resourcePath bundled dictionary resource path
+ * @param resourcePath bundled dictionary resource path
* @throws IOException if reading or writing fails
*/
@ParameterizedTest(name = "[{index}] {0}")
@MethodSource("org.egothor.stemmer.CompileIntegrationTest#bundledDictionaryCases")
+ @Tag("slow")
@DisplayName("CLI should compile bundled project dictionaries and preserve representative variant semantics")
void shouldCompileBundledProjectDictionaryAndPreserveRepresentativeVariantSemantics(final String scenario,
final String resourcePath) throws IOException {
diff --git a/src/test/java/org/egothor/stemmer/CompileTest.java b/src/test/java/org/egothor/stemmer/CompileTest.java
index 2cd66e4..0069457 100644
--- a/src/test/java/org/egothor/stemmer/CompileTest.java
+++ b/src/test/java/org/egothor/stemmer/CompileTest.java
@@ -66,7 +66,10 @@ import org.junit.jupiter.api.io.TempDir;
* {@link System#exit(int)}.
*
*/
-@Tag("unit")
+@Tag("integration")
+@Tag("cli")
+@Tag("compile")
+@Tag("stemmer")
@DisplayName("Compile")
class CompileTest {
diff --git a/src/test/java/org/egothor/stemmer/CompiledTrieArtifactRegressionTest.java b/src/test/java/org/egothor/stemmer/CompiledTrieArtifactRegressionTest.java
index 8f178df..271f568 100644
--- a/src/test/java/org/egothor/stemmer/CompiledTrieArtifactRegressionTest.java
+++ b/src/test/java/org/egothor/stemmer/CompiledTrieArtifactRegressionTest.java
@@ -70,10 +70,11 @@ import org.junit.jupiter.params.provider.MethodSource;
* compressed artifact reproducibility within the active format version
*
*/
-@Tag("unit")
+@Tag("compat")
@Tag("regression")
@Tag("determinism")
@Tag("serialization")
+@Tag("trie")
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
final class CompiledTrieArtifactRegressionTest {
diff --git a/src/test/java/org/egothor/stemmer/DiacriticStripperTest.java b/src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
index 66a8086..6f87781 100644
--- a/src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
+++ b/src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
@@ -41,7 +41,8 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link DiacriticStripper}.
*/
@Tag("unit")
-@Tag("diacritics")
+@Tag("diacritic")
+@Tag("stemmer")
@DisplayName("DiacriticStripper")
class DiacriticStripperTest {
diff --git a/src/test/java/org/egothor/stemmer/FrequencyTrieBuildersTest.java b/src/test/java/org/egothor/stemmer/FrequencyTrieBuildersTest.java
index 51a23c0..c850346 100644
--- a/src/test/java/org/egothor/stemmer/FrequencyTrieBuildersTest.java
+++ b/src/test/java/org/egothor/stemmer/FrequencyTrieBuildersTest.java
@@ -59,7 +59,7 @@ import org.junit.jupiter.api.Test;
*/
@DisplayName("FrequencyTrieBuilders")
@Tag("unit")
-@Tag("builder")
+@Tag("construction")
@Tag("frequency-trie")
class FrequencyTrieBuildersTest {
diff --git a/src/test/java/org/egothor/stemmer/FrequencyTrieProperties.java b/src/test/java/org/egothor/stemmer/FrequencyTrieProperties.java
index e0ef4a9..00e898c 100644
--- a/src/test/java/org/egothor/stemmer/FrequencyTrieProperties.java
+++ b/src/test/java/org/egothor/stemmer/FrequencyTrieProperties.java
@@ -47,7 +47,7 @@ import java.util.List;
import net.jqwik.api.ForAll;
import net.jqwik.api.Label;
import net.jqwik.api.Property;
-import net.jqwik.api.Tag;
+import org.junit.jupiter.api.Tag;
/**
* Property-based tests for the compiled trie abstraction.
@@ -59,9 +59,9 @@ import net.jqwik.api.Tag;
* core algorithm without overfitting to particular fixture data.
*/
@Label("FrequencyTrie properties")
-@Tag("unit")
@Tag("property")
@Tag("trie")
+@Tag("frequency-trie")
class FrequencyTrieProperties extends PropertyBasedTestSupport {
/**
diff --git a/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java b/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java
index 6835799..7afbe15 100644
--- a/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java
+++ b/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java
@@ -33,6 +33,7 @@ package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertSame;
@@ -379,6 +380,24 @@ class FrequencyTrieTest {
assertThrows(UnsupportedOperationException.class, () -> entries.add(new ValueCount("z", 1)));
}
+ /**
+ * Verifies that {@link FrequencyTrie#getEntries(String)} short-circuits to a one-item immutable list.
+ */
+ @Test
+ @DisplayName("getEntries returns a one-item list for single stored values")
+ void getEntriesReturnsSingleItemListForSingleStoredValue() {
+ final FrequencyTrie.Builder builder = rankedBuilder();
+
+ builder.put("gamma", "only");
+
+ final FrequencyTrie trie = builder.build();
+
+ final List> entries = trie.getEntries("gamma");
+
+ assertAll(() -> assertEquals(List.of(new ValueCount("only", 1)), entries),
+ () -> assertThrows(UnsupportedOperationException.class, () -> entries.add(new ValueCount("z", 1))));
+ }
+
/**
* Verifies that equal frequencies prefer the shorter string representation.
*/
@@ -755,6 +774,115 @@ class FrequencyTrieTest {
.readFrom(new ByteArrayInputStream(serializedEmptyTrie), String[]::new, null)));
}
+ /**
+ * Verifies that {@code readFrom} rejects a {@code maxExpandedIndex} override below
+ * {@code -1} with an {@link IllegalArgumentException} carrying the expected message.
+ */
+ @Test
+ @Tag("persistence")
+ @DisplayName("readFrom rejects invalid maxExpandedIndex override")
+ void readFromRejectsInvalidMaxExpandedIndexOverride() {
+ final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
+ dataOutput.writeInt(0);
+ dataOutput.writeInt(0);
+ } });
+
+ final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
+ () -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC, -2));
+
+ assertEquals("maxExpandedIndex must be >= -1.", exception.getMessage());
+ }
+
+ /**
+ * Verifies that an override of {@code -1} keeps the default dense lookup, that
+ * {@code 0} disables it, and that lookups return the same results either way.
+ */
+ @Test
+ @Tag("persistence")
+ @DisplayName("readFrom respects dense lookup max-expanded index override")
+ void readFromRespectsDenseLookupMaxExpandedIndexOverride() throws IOException {
+ final FrequencyTrie.Builder builder = rankedBuilder();
+
+ builder.put("a", "a");
+ builder.put("b", "b");
+ builder.put("c", "c");
+ builder.put("d", "d");
+
+ final FrequencyTrie original = builder.build();
+ final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+ original.writeTo(outputStream, STRING_CODEC);
+ final byte[] serializedTrie = outputStream.toByteArray();
+
+ final FrequencyTrie defaultDense = FrequencyTrie.readFrom(new ByteArrayInputStream(serializedTrie), String[]::new,
+ STRING_CODEC);
+ final FrequencyTrie defaultDenseByNegative = FrequencyTrie.readFrom(new ByteArrayInputStream(serializedTrie),
+ String[]::new, STRING_CODEC, -1);
+ final FrequencyTrie disabledDense = FrequencyTrie.readFrom(new ByteArrayInputStream(serializedTrie), String[]::new,
+ STRING_CODEC, 0);
+
+ assertAll(
+ () -> assertTrue(defaultDense.root().hasDenseLookup(),
+ "Default read should enable dense lookup for compact first-level edges."),
+ () -> assertTrue(defaultDenseByNegative.root().hasDenseLookup(),
+ "Negative override should use the default dense lookup span."),
+ () -> assertFalse(disabledDense.root().hasDenseLookup(),
+ "Zero override should disable dense lookup tables."),
+ () -> assertEquals(original.get("a"), disabledDense.get("a")),
+ () -> assertEquals(original.get("b"), disabledDense.get("b")),
+ () -> assertEquals(original.get("c"), disabledDense.get("c")),
+ () -> assertEquals(original.get("d"), disabledDense.get("d")),
+ () -> assertEquals(original.get("z"), disabledDense.get("z")));
+ }
+
+ /**
+ * Verifies that a two-node stream whose nodes reference each other is rejected
+ * with an {@link IOException} reporting a cyclic reference.
+ */
+ @Test
+ @Tag("persistence")
+ @DisplayName("readFrom rejects cyclic serialized node references")
+ void readFromRejectsCyclicSerializedNodeReferences() {
+ final byte[] bytes = createSerializedStream(0x45475452, 1, 2, 0, new NodeWriter[] {
+ dataOutput -> {
+ dataOutput.writeInt(1);
+ dataOutput.writeChar('b');
+ dataOutput.writeInt(1);
+ dataOutput.writeInt(0);
+ },
+ dataOutput -> {
+ dataOutput.writeInt(1);
+ dataOutput.writeChar('a');
+ dataOutput.writeInt(0);
+ dataOutput.writeInt(0);
+ } });
+
+ final IOException exception = assertThrows(IOException.class,
+ () -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
+
+ assertTrue(exception.getMessage().contains("cyclic reference detected"));
+ }
+
+ /**
+ * Verifies that a stream declaring one node but referencing an out-of-range child
+ * id is rejected with an {@link IOException} reporting an invalid child node id.
+ */
+ @Test
+ @Tag("persistence")
+ @DisplayName("readFrom rejects invalid child node identifiers")
+ void readFromRejectsInvalidChildNodeId() {
+ final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
+ dataOutput.writeInt(1);
+ dataOutput.writeChar('a');
+ dataOutput.writeInt(3);
+ dataOutput.writeInt(0);
+ } });
+
+ final IOException exception = assertThrows(IOException.class,
+ () -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
+
+ assertTrue(exception.getMessage().contains("Invalid child node id"));
+ }
+
/**
* Verifies that deserialization rejects an invalid stream magic header.
*/
@@ -785,6 +913,27 @@ class FrequencyTrieTest {
assertTrue(exception.getMessage().contains("Unsupported trie stream version"));
}
+ /**
+ * Verifies that a current-format stream with a malformed metadata string is rejected.
+ */
+ @Test
+ @Tag("persistence")
+ @DisplayName("readFrom rejects invalid textual metadata block")
+ void readFromRejectsInvalidTextualMetadataBlock() {
+ final int version = FrequencyTrie.currentFormatVersion();
+ final byte[] bytes = createSerializedStream(0x45475452, version, 1, 0, dataOutput -> {
+ dataOutput.writeUTF("not valid metadata");
+ }, new NodeWriter[] { dataOutput -> {
+ dataOutput.writeInt(0);
+ dataOutput.writeInt(0);
+ } });
+
+ final IOException exception = assertThrows(IOException.class,
+ () -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
+
+ assertTrue(exception.getMessage().contains("Invalid metadata block"));
+ }
+
/**
* Verifies that deserialization rejects a negative node count.
*/
@@ -862,6 +1011,129 @@ class FrequencyTrieTest {
assertTrue(exception.getMessage().contains("Non-positive stored count"));
}
+ /**
+ * Verifies that a version 1 stream written without a metadata block reads as legacy metadata.
+ */
+ @Test
+ @Tag("persistence")
+ @DisplayName("readFrom supports legacy version 1 metadata")
+ void readFromSupportsLegacyVersionOneMetadata() throws IOException {
+ final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
+ dataOutput.writeInt(0);
+ dataOutput.writeInt(0);
+ } });
+
+ final FrequencyTrie trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
+
+ assertEquals(TrieMetadata.legacy(1, WordTraversalDirection.BACKWARD), trie.metadata());
+ }
+
+ /**
+ * Verifies that a version 2 stream, whose metadata is a single traversal-direction
+ * ordinal, reads as legacy metadata carrying that direction.
+ */
+ @Test
+ @Tag("persistence")
+ @DisplayName("readFrom supports legacy version 2 metadata")
+ void readFromSupportsLegacyVersionTwoMetadata() throws IOException {
+ final byte[] bytes = createSerializedStream(0x45475452, 2, 1, 0,
+ dataOutput -> dataOutput.writeInt(WordTraversalDirection.FORWARD.ordinal()), new NodeWriter[] { dataOutput -> {
+ dataOutput.writeInt(0);
+ dataOutput.writeInt(0);
+ } });
+
+ final FrequencyTrie trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
+
+ assertEquals(TrieMetadata.legacy(2, WordTraversalDirection.FORWARD), trie.metadata());
+ }
+
+ /**
+ * Verifies that version 3 metadata restores direction, reduction and diacritic
+ * settings and reports {@code LOWERCASE_WITH_LOCALE_ROOT} case processing.
+ */
+ @Test
+ @Tag("persistence")
+ @DisplayName("readFrom parses version 3 metadata")
+ void readFromParsesVersionThreeMetadata() throws IOException {
+ final ReductionSettings reductionSettings = new ReductionSettings(
+ ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS, 81, 4);
+
+ final byte[] bytes = createSerializedStream(0x45475452, 3, 1, 0,
+ dataOutput -> {
+ dataOutput.writeInt(WordTraversalDirection.BACKWARD.ordinal());
+ dataOutput.writeInt(reductionSettings.reductionMode().ordinal());
+ dataOutput.writeInt(reductionSettings.dominantWinnerMinPercent());
+ dataOutput.writeInt(reductionSettings.dominantWinnerOverSecondRatio());
+ dataOutput.writeInt(DiacriticProcessingMode.REMOVE.ordinal());
+ },
+ new NodeWriter[] { dataOutput -> {
+ dataOutput.writeInt(0);
+ dataOutput.writeInt(0);
+ } });
+
+ final FrequencyTrie trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
+ final TrieMetadata metadata = trie.metadata();
+
+ assertAll(() -> assertEquals(3, metadata.formatVersion()),
+ () -> assertEquals(WordTraversalDirection.BACKWARD, metadata.traversalDirection()),
+ () -> assertEquals(reductionSettings, metadata.reductionSettings()),
+ () -> assertEquals(DiacriticProcessingMode.REMOVE, metadata.diacriticProcessingMode()),
+ () -> assertEquals(CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, metadata.caseProcessingMode()));
+ }
+
+ /**
+ * Verifies that version 4 metadata restores every field, including the case-processing mode.
+ */
+ @Test
+ @Tag("persistence")
+ @DisplayName("readFrom parses version 4 case processing metadata")
+ void readFromParsesVersionFourCaseMetadata() throws IOException {
+ final ReductionSettings reductionSettings = new ReductionSettings(
+ ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, 75, 3);
+
+ final byte[] bytes = createSerializedStream(0x45475452, 4, 1, 0,
+ dataOutput -> {
+ dataOutput.writeInt(WordTraversalDirection.FORWARD.ordinal());
+ dataOutput.writeInt(reductionSettings.reductionMode().ordinal());
+ dataOutput.writeInt(reductionSettings.dominantWinnerMinPercent());
+ dataOutput.writeInt(reductionSettings.dominantWinnerOverSecondRatio());
+ dataOutput.writeInt(DiacriticProcessingMode.AS_IS.ordinal());
+ dataOutput.writeInt(CaseProcessingMode.AS_IS.ordinal());
+ },
+ new NodeWriter[] { dataOutput -> {
+ dataOutput.writeInt(0);
+ dataOutput.writeInt(0);
+ } });
+
+ final FrequencyTrie trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
+ final TrieMetadata metadata = trie.metadata();
+
+ assertAll(() -> assertEquals(4, metadata.formatVersion()),
+ () -> assertEquals(WordTraversalDirection.FORWARD, metadata.traversalDirection()),
+ () -> assertEquals(reductionSettings, metadata.reductionSettings()),
+ () -> assertEquals(DiacriticProcessingMode.AS_IS, metadata.diacriticProcessingMode()),
+ () -> assertEquals(CaseProcessingMode.AS_IS, metadata.caseProcessingMode()));
+ }
+
+ /**
+ * Verifies that a version 2 stream with an unknown traversal-direction ordinal is rejected.
+ */
+ @Test
+ @Tag("persistence")
+ @DisplayName("readFrom rejects invalid metadata ordinal in legacy stream")
+ void readFromRejectsInvalidLegacyMetadataOrdinal() {
+ final byte[] bytes = createSerializedStream(0x45475452, 2, 1, 0,
+ dataOutput -> dataOutput.writeInt(999), new NodeWriter[] { dataOutput -> {
+ dataOutput.writeInt(0);
+ dataOutput.writeInt(0);
+ } });
+
+ final IOException exception = assertThrows(IOException.class,
+ () -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
+
+ assertTrue(exception.getMessage().contains("Invalid traversal direction ordinal"));
+ }
+
/**
* Writes one node body into a synthetic serialized trie stream.
*/
@@ -889,6 +1161,24 @@ class FrequencyTrieTest {
*/
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
final int rootNodeId, final NodeWriter[] nodes) {
+ return createSerializedStream(magic, version, nodeCount, rootNodeId, dataOutput -> {
+ // Intentionally empty: this overload writes no metadata block before the node bodies.
+ }, nodes);
+ }
+
+ /**
+ * Writes a synthetic serialized trie stream with a metadata writer hook.
+ *
+ * @param magic stream magic
+ * @param version stream version
+ * @param nodeCount declared node count
+ * @param rootNodeId declared root node identifier
+ * @param metadata version-specific metadata writer
+ * @param nodes node body writers
+ * @return serialized bytes
+ */
+ private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
+ final int rootNodeId, final MetadataWriter metadata, final NodeWriter[] nodes) {
try {
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
final DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream);
@@ -897,6 +1187,7 @@ class FrequencyTrieTest {
dataOutputStream.writeInt(version);
dataOutputStream.writeInt(nodeCount);
dataOutputStream.writeInt(rootNodeId);
+ metadata.write(dataOutputStream);
for (NodeWriter node : nodes) {
node.write(dataOutputStream);
@@ -908,4 +1199,19 @@ class FrequencyTrieTest {
throw new IllegalStateException("Unexpected I/O while building synthetic trie stream.", exception);
}
}
+
+ /**
+ * Test hook that writes the version-specific metadata block of a synthetic trie stream.
+ */
+ @FunctionalInterface
+ private interface MetadataWriter {
+
+ /**
+ * Writes the metadata bytes expected by the stream version under test.
+ *
+ * @param dataOutput stream that receives the metadata bytes
+ * @throws IOException if writing fails
+ */
+ void write(DataOutputStream dataOutput) throws IOException;
+ }
}
diff --git a/src/test/java/org/egothor/stemmer/FuzzStemmerAndTrieCompilationTest.java b/src/test/java/org/egothor/stemmer/FuzzStemmerAndTrieCompilationTest.java
index 933a749..a0e11b0 100644
--- a/src/test/java/org/egothor/stemmer/FuzzStemmerAndTrieCompilationTest.java
+++ b/src/test/java/org/egothor/stemmer/FuzzStemmerAndTrieCompilationTest.java
@@ -65,10 +65,9 @@ import org.junit.jupiter.api.io.TempDir;
* stems declared by the source dictionary.
*/
@DisplayName("Deterministic fuzz-style trie and stemmer compilation")
-@Tag("unit")
@Tag("fuzz")
@Tag("trie")
-@Tag("stemming")
+@Tag("stemmer")
class FuzzStemmerAndTrieCompilationTest {
/**
diff --git a/src/test/java/org/egothor/stemmer/PatchCommandEncoderProperties.java b/src/test/java/org/egothor/stemmer/PatchCommandEncoderProperties.java
index 65655c9..1c409b6 100644
--- a/src/test/java/org/egothor/stemmer/PatchCommandEncoderProperties.java
+++ b/src/test/java/org/egothor/stemmer/PatchCommandEncoderProperties.java
@@ -36,7 +36,7 @@ import static org.junit.jupiter.api.Assertions.assertNotNull;
import net.jqwik.api.ForAll;
import net.jqwik.api.Label;
import net.jqwik.api.Property;
-import net.jqwik.api.Tag;
+import org.junit.jupiter.api.Tag;
/**
* Property-based tests for {@link PatchCommandEncoder}.
@@ -47,9 +47,9 @@ import net.jqwik.api.Tag;
* reconstruct the exact requested target.
*/
@Label("PatchCommandEncoder properties")
-@Tag("unit")
@Tag("property")
@Tag("patch")
+@Tag("stemmer")
class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
/**
diff --git a/src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java b/src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java
index c8decdc..b608ef7 100644
--- a/src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java
+++ b/src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java
@@ -241,7 +241,7 @@ class PatchCommandEncoderTest {
*/
@Nested
@DisplayName("construction")
- @Tag("constructor")
+ @Tag("construction")
class ConstructionTests {
/**
@@ -326,7 +326,7 @@ class PatchCommandEncoderTest {
*/
@Nested
@DisplayName("encode(String, String)")
- @Tag("encode")
+ @Tag("encoding")
class EncodeTests {
/**
@@ -658,7 +658,7 @@ class PatchCommandEncoderTest {
*/
@Nested
@DisplayName("reversed-word processing")
- @Tag("reverse")
+ @Tag("normalization")
class ReversedWordProcessingTests {
/**
diff --git a/src/test/java/org/egothor/stemmer/StemmerDictionaryParserTest.java b/src/test/java/org/egothor/stemmer/StemmerDictionaryParserTest.java
index 353c29c..bbf9267 100644
--- a/src/test/java/org/egothor/stemmer/StemmerDictionaryParserTest.java
+++ b/src/test/java/org/egothor/stemmer/StemmerDictionaryParserTest.java
@@ -75,6 +75,7 @@ import org.junit.jupiter.api.io.TempDir;
@DisplayName("StemmerDictionaryParser")
@Tag("unit")
@Tag("parser")
+@Tag("stemmer")
class StemmerDictionaryParserTest {
/**
diff --git a/src/test/java/org/egothor/stemmer/StemmerKnowledgeExperimentTest.java b/src/test/java/org/egothor/stemmer/StemmerKnowledgeExperimentTest.java
index 7a233d8..ad2031f 100644
--- a/src/test/java/org/egothor/stemmer/StemmerKnowledgeExperimentTest.java
+++ b/src/test/java/org/egothor/stemmer/StemmerKnowledgeExperimentTest.java
@@ -54,9 +54,9 @@ import org.junit.jupiter.api.io.TempDir;
/**
* Tests for {@link StemmerKnowledgeExperiment}.
*/
-@Tag("unit")
@Tag("integration")
@Tag("stemmer")
+@Tag("trie")
final class StemmerKnowledgeExperimentTest {
/**
diff --git a/src/test/java/org/egothor/stemmer/StemmerPatchTrieBinaryIOTest.java b/src/test/java/org/egothor/stemmer/StemmerPatchTrieBinaryIOTest.java
index a7171c0..bbcca83 100644
--- a/src/test/java/org/egothor/stemmer/StemmerPatchTrieBinaryIOTest.java
+++ b/src/test/java/org/egothor/stemmer/StemmerPatchTrieBinaryIOTest.java
@@ -38,6 +38,8 @@ import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.anyInt;
+import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.mockStatic;
import static org.mockito.Mockito.verify;
@@ -91,6 +93,8 @@ import org.mockito.MockedStatic;
@Tag("unit")
@Tag("io")
@Tag("persistence")
+@Tag("serialization")
+@Tag("trie")
@DisplayName("StemmerPatchTrieBinaryIO")
class StemmerPatchTrieBinaryIOTest {
@@ -299,9 +303,19 @@ class StemmerPatchTrieBinaryIOTest {
"read(Path) must reject null path."),
() -> assertThrows(NullPointerException.class, () -> StemmerPatchTrieBinaryIO.read((String) null),
"read(String) must reject null file name."),
+ () -> assertThrows(NullPointerException.class,
+ () -> StemmerPatchTrieBinaryIO.read((Path) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
+ "read(Path, int) must reject null path."),
+ () -> assertThrows(NullPointerException.class,
+ () -> StemmerPatchTrieBinaryIO.read((String) null,
+ FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
+ "read(String, int) must reject null file name."),
() -> assertThrows(NullPointerException.class,
() -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null),
- "read(InputStream) must reject null input stream."));
+ "read(InputStream) must reject null input stream."),
+ () -> assertThrows(NullPointerException.class,
+ () -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
+ "read(InputStream, int) must reject null input stream."));
}
/**
@@ -385,6 +399,143 @@ class StemmerPatchTrieBinaryIOTest {
}
}
+ /**
+ * Verifies that {@code read(InputStream, int)} returns the trie produced by the
+ * four-argument {@code FrequencyTrie.readFrom} and passes the override unchanged.
+ */
+ @SuppressWarnings("unchecked")
+ @Test
+ @DisplayName("Should delegate stream read with dense span override")
+ void shouldDelegateInputStreamReadWithDenseSpanOverride() throws IOException {
+ final FrequencyTrie expectedTrie = mock(FrequencyTrie.class);
+ final byte[] gzipPayload = gzip("binary-content-with-max-expanded-index");
+
+ try (@SuppressWarnings("rawtypes")
+ MockedStatic mockedStatic = mockStatic(FrequencyTrie.class)) {
+ mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
+ any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(expectedTrie);
+
+ final FrequencyTrie actualTrie = StemmerPatchTrieBinaryIO
+ .read(new ByteArrayInputStream(gzipPayload), 17);
+
+ assertSame(expectedTrie, actualTrie,
+ "read(InputStream, int) must return the trie produced by FrequencyTrie.readFrom(...).");
+
+ mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
+ any(FrequencyTrie.ValueStreamCodec.class), eq(17)));
+ }
+ }
+
+ /**
+ * Verifies that {@code read(Path, int)} returns the trie produced by the
+ * four-argument {@code FrequencyTrie.readFrom} and passes the override unchanged.
+ */
+ @SuppressWarnings("unchecked")
+ @Test
+ @DisplayName("Should delegate path read with dense span override")
+ void shouldDelegatePathReadWithDenseSpanOverride() throws IOException {
+ final FrequencyTrie expectedTrie = mock(FrequencyTrie.class);
+ final Path sourceFile = temporaryDirectory.resolve("input-max-expanded.bin.gz");
+ Files.write(sourceFile, gzip("path-based-max-expanded-index"));
+
+ try (@SuppressWarnings("rawtypes")
+ MockedStatic mockedStatic = mockStatic(FrequencyTrie.class)) {
+ mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
+ any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(expectedTrie);
+
+ final FrequencyTrie actualTrie = StemmerPatchTrieBinaryIO.read(sourceFile, 0);
+
+ assertSame(expectedTrie, actualTrie,
+ "read(Path, int) must return the trie produced by FrequencyTrie.readFrom(...).");
+
+ mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
+ any(FrequencyTrie.ValueStreamCodec.class), eq(0)));
+ }
+ }
+
+ /**
+ * Verifies that {@code read(String, int)} returns the trie produced by the
+ * four-argument {@code FrequencyTrie.readFrom} and passes the override unchanged.
+ */
+ @SuppressWarnings("unchecked")
+ @Test
+ @DisplayName("Should delegate file name read with dense span override")
+ void shouldDelegateStringReadWithDenseSpanOverride() throws IOException {
+ final FrequencyTrie expectedTrie = mock(FrequencyTrie.class);
+ final Path sourceFile = temporaryDirectory.resolve("input-string-max-expanded.bin.gz");
+ Files.write(sourceFile, gzip("string-based-max-expanded-index"));
+
+ try (@SuppressWarnings("rawtypes")
+ MockedStatic mockedStatic = mockStatic(FrequencyTrie.class)) {
+ mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
+ any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(expectedTrie);
+
+ final FrequencyTrie actualTrie = StemmerPatchTrieBinaryIO.read(sourceFile.toString(), 32);
+
+ assertSame(expectedTrie, actualTrie,
+ "read(String, int) must return the trie produced by FrequencyTrie.readFrom(...).");
+
+ mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
+ any(FrequencyTrie.ValueStreamCodec.class), eq(32)));
+ }
+ }
+
+ /**
+ * Verifies that {@code readMetadata(InputStream)} returns the metadata of a trie persisted by {@code write}.
+ */
+ @Test
+ @DisplayName("Should read metadata from gzip payload")
+ void shouldReadMetadataFromGzipPayload() throws IOException {
+ final FrequencyTrie.Builder builder = new FrequencyTrie.Builder(String[]::new,
+ ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
+ builder.put("run", PatchCommandEncoder.builder().build().encode("running", "run"));
+ final FrequencyTrie trie = builder.build();
+
+ final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+ StemmerPatchTrieBinaryIO.write(trie, outputStream);
+
+ final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(new ByteArrayInputStream(outputStream.toByteArray()));
+
+ assertEquals(trie.metadata(), metadata,
+ "readMetadata(InputStream) must return the same metadata persisted by write().");
+ }
+
+ /**
+ * Verifies that {@code readMetadata(Path)} returns the metadata of a trie written to that path.
+ */
+ @Test
+ @DisplayName("Should read metadata from file path")
+ void shouldReadMetadataFromPath() throws IOException {
+ final FrequencyTrie.Builder builder = new FrequencyTrie.Builder(String[]::new,
+ ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
+ builder.put("city", PatchCommandEncoder.builder().build().encode("cities", "city"));
+ final FrequencyTrie trie = builder.build();
+
+ final Path sourceFile = temporaryDirectory.resolve("metadata-path.bin.gz");
+ StemmerPatchTrieBinaryIO.write(trie, sourceFile);
+
+ final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(sourceFile);
+ assertEquals(trie.metadata(), metadata);
+ }
+
+ /**
+ * Verifies that {@code readMetadata(String)} returns the metadata of a trie written to that file.
+ */
+ @Test
+ @DisplayName("Should read metadata from file name")
+ void shouldReadMetadataFromStringPath() throws IOException {
+ final FrequencyTrie.Builder builder = new FrequencyTrie.Builder(String[]::new,
+ ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
+ builder.put("city", PatchCommandEncoder.builder().build().encode("cities", "city"));
+ final FrequencyTrie trie = builder.build();
+
+ final Path sourceFile = temporaryDirectory.resolve("metadata-string.bin.gz");
+ StemmerPatchTrieBinaryIO.write(trie, sourceFile);
+
+ final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(sourceFile.toString());
+ assertEquals(trie.metadata(), metadata);
+ }
+
/**
* Verifies that malformed non-GZip input is reported as an I/O failure.
*/
diff --git a/src/test/java/org/egothor/stemmer/StemmerPatchTrieLoaderTest.java b/src/test/java/org/egothor/stemmer/StemmerPatchTrieLoaderTest.java
index f9f17ae..6778214 100644
--- a/src/test/java/org/egothor/stemmer/StemmerPatchTrieLoaderTest.java
+++ b/src/test/java/org/egothor/stemmer/StemmerPatchTrieLoaderTest.java
@@ -85,9 +85,10 @@ import org.junit.jupiter.params.provider.MethodSource;
* the current bundled language set, including right-to-left metadata
*
*/
-@Tag("unit")
@Tag("integration")
@Tag("stemmer")
+@Tag("io")
+@Tag("parser")
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
final class StemmerPatchTrieLoaderTest {
@@ -210,36 +211,43 @@ final class StemmerPatchTrieLoaderTest {
Arguments.of("14-load-binary-string",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null),
StemmerPatchTrieLoader.FILENAME_REQUIRED),
- Arguments.of("15-load-binary-stream",
+ Arguments.of("15-load-binary-path-override",
+ (ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
+ "path"),
+ Arguments.of("16-load-binary-string-override",
+ (ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null,
+ FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
+ StemmerPatchTrieLoader.FILENAME_REQUIRED),
+ Arguments.of("17-load-binary-stream",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null),
"inputStream"),
- Arguments.of("16-save-binary-null-trie-path",
+ Arguments.of("18-save-binary-null-trie-path",
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath()), "trie"),
- Arguments.of("17-save-binary-null-path",
+ Arguments.of("19-save-binary-null-path",
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (Path) null), "path"),
- Arguments.of("18-save-binary-null-trie-string",
+ Arguments.of("20-save-binary-null-trie-string",
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath().toString()),
"trie"),
- Arguments.of("19-save-binary-null-string",
+ Arguments.of("21-save-binary-null-string",
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
StemmerPatchTrieLoader.FILENAME_REQUIRED),
- Arguments.of("20-load-language-null-metadata",
+ Arguments.of("22-load-language-null-metadata",
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
true, (TrieMetadata) null),
"metadata"),
- Arguments.of("21-load-path-null-metadata",
+ Arguments.of("23-load-path-null-metadata",
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (TrieMetadata) null),
"metadata"),
- Arguments.of("22-load-string-null-metadata",
+ Arguments.of("24-load-string-null-metadata",
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
(TrieMetadata) null),
"metadata"),
- Arguments.of("23-load-binary-metadata-path-null",
+ Arguments.of("25-load-binary-metadata-path-null",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((Path) null), "path"),
- Arguments.of("24-load-binary-metadata-string-null",
+ Arguments.of("26-load-binary-metadata-string-null",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((String) null),
StemmerPatchTrieLoader.FILENAME_REQUIRED),
- Arguments.of("25-load-binary-metadata-stream-null",
+ Arguments.of("27-load-binary-metadata-stream-null",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((InputStream) null),
"inputStream"));
}
@@ -512,6 +520,44 @@ final class StemmerPatchTrieLoaderTest {
}
}
+ /**
+ * Verifies that the {@code loadBinary} overloads taking a dense lookup span load
+ * tries equivalent to the original and that a zero span disables dense lookup.
+ */
+ @Test
+ @DisplayName("Binary dense-span override overloads should load equivalent tries")
+ void shouldLoadBinaryWithDenseSpanOverrideOverloads() throws IOException {
+ final Path dictionaryFile = writeDictionary("""
+ run running runs runner
+ city cities
+ study studies studying
+ """);
+ final Path binaryFile = tempDir.resolve("stemmer-trie-overrides.bin.gz");
+
+ final FrequencyTrie original = StemmerPatchTrieLoader.load(dictionaryFile, true,
+ DEFAULT_REDUCTION_MODE);
+
+ StemmerPatchTrieLoader.saveBinary(original, binaryFile);
+
+ final FrequencyTrie fromPathDefault = StemmerPatchTrieLoader.loadBinary(binaryFile);
+ final FrequencyTrie fromPathDefaultByNegative = StemmerPatchTrieLoader.loadBinary(binaryFile,
+ FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX);
+ final FrequencyTrie fromPathNoDense = StemmerPatchTrieLoader.loadBinary(binaryFile, 0);
+ final FrequencyTrie fromStringNoDense = StemmerPatchTrieLoader.loadBinary(binaryFile.toString(), 0);
+
+ assertTriePatchSemanticsEqual(original, fromPathDefault, "run", "running", "runner", "cities", "studying");
+ assertTriePatchSemanticsEqual(original, fromPathDefaultByNegative, "run", "running", "runner", "cities",
+ "studying");
+ assertTriePatchSemanticsEqual(original, fromPathNoDense, "run", "running", "runner", "cities", "studying");
+ assertTriePatchSemanticsEqual(original, fromStringNoDense, "run", "running", "runner", "cities",
+ "studying");
+
+ assertFalse(fromPathNoDense.root().hasDenseLookup(),
+ "Zero span should disable dense lookup on the loaded root.");
+ assertFalse(fromStringNoDense.root().hasDenseLookup(),
+ "Zero span should disable dense lookup on the loaded root.");
+ }
+
/**
* Writes a dictionary file into the temporary directory.
*
@@ -530,6 +576,7 @@ final class StemmerPatchTrieLoaderTest {
* Bundled dictionary integration tests.
*/
@Nested
+ @Tag("slow")
@DisplayName("Bundled dictionaries")
final class BundledDictionaryTests {
diff --git a/src/test/java/org/egothor/stemmer/StemmerPatchTrieProperties.java b/src/test/java/org/egothor/stemmer/StemmerPatchTrieProperties.java
index 23f8fc5..27474cc 100644
--- a/src/test/java/org/egothor/stemmer/StemmerPatchTrieProperties.java
+++ b/src/test/java/org/egothor/stemmer/StemmerPatchTrieProperties.java
@@ -44,7 +44,7 @@ import java.util.Set;
import net.jqwik.api.ForAll;
import net.jqwik.api.Label;
import net.jqwik.api.Property;
-import net.jqwik.api.Tag;
+import org.junit.jupiter.api.Tag;
/**
* Property-based tests for patch-command stemmer tries.
@@ -56,9 +56,8 @@ import net.jqwik.api.Tag;
* persistence must not alter that behavior.
*/
@Label("Stemmer patch trie properties")
-@Tag("unit")
@Tag("property")
-@Tag("stemming")
+@Tag("stemmer")
class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
/**
diff --git a/src/test/java/org/egothor/stemmer/TrieMetadataTest.java b/src/test/java/org/egothor/stemmer/TrieMetadataTest.java
index 9d7988a..6d0dfb8 100644
--- a/src/test/java/org/egothor/stemmer/TrieMetadataTest.java
+++ b/src/test/java/org/egothor/stemmer/TrieMetadataTest.java
@@ -40,6 +40,8 @@ import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
@Tag("unit")
+@Tag("metadata")
+@Tag("trie")
@DisplayName("TrieMetadata")
class TrieMetadataTest {
diff --git a/src/test/java/org/egothor/stemmer/WordTraversalDirectionTest.java b/src/test/java/org/egothor/stemmer/WordTraversalDirectionTest.java
index ce50176..1c8d376 100644
--- a/src/test/java/org/egothor/stemmer/WordTraversalDirectionTest.java
+++ b/src/test/java/org/egothor/stemmer/WordTraversalDirectionTest.java
@@ -40,6 +40,8 @@ import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
@Tag("unit")
+@Tag("core")
+@Tag("stemmer")
@DisplayName("WordTraversalDirection")
class WordTraversalDirectionTest {
diff --git a/src/test/java/org/egothor/stemmer/trie/ChildDescriptorTest.java b/src/test/java/org/egothor/stemmer/trie/ChildDescriptorTest.java
index 069a4a3..ee4c394 100644
--- a/src/test/java/org/egothor/stemmer/trie/ChildDescriptorTest.java
+++ b/src/test/java/org/egothor/stemmer/trie/ChildDescriptorTest.java
@@ -45,7 +45,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link ChildDescriptor}.
*/
@Tag("unit")
-@Tag("fast")
+@Tag("trie")
@DisplayName("ChildDescriptor")
class ChildDescriptorTest {
diff --git a/src/test/java/org/egothor/stemmer/trie/CompiledNodeAndNodeDataTest.java b/src/test/java/org/egothor/stemmer/trie/CompiledNodeAndNodeDataTest.java
index 96f7054..b8a9a93 100644
--- a/src/test/java/org/egothor/stemmer/trie/CompiledNodeAndNodeDataTest.java
+++ b/src/test/java/org/egothor/stemmer/trie/CompiledNodeAndNodeDataTest.java
@@ -31,8 +31,11 @@
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
@@ -43,7 +45,6 @@ import org.junit.jupiter.api.Test;
* documented backing-array exposure.
*/
@Tag("unit")
-@Tag("fast")
@Tag("trie")
@DisplayName("CompiledNode and NodeData")
class CompiledNodeAndNodeDataTest {
@@ -141,4 +142,136 @@ class CompiledNodeAndNodeDataTest {
assertSame(orderedValues, node.orderedValues());
assertSame(orderedCounts, node.orderedCounts());
}
+
+ /**
+ * Verifies that a node with contiguous labels reports a dense table and resolves hits and misses.
+ */
+ @Test
+ @DisplayName("CompiledNode can resolve child via dense lookup table")
+ void compiledNodeUsesDenseLookupForCompactIntervals() {
+ @SuppressWarnings("unchecked")
+ final CompiledNode[] children = new CompiledNode[4];
+ children[0] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+ children[1] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+ children[2] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+ children[3] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+
+ final CompiledNode node = new CompiledNode<>(new char[] { 'a', 'b', 'c', 'd' }, children,
+ new String[] { "1", "2", "3", "4" }, new int[] { 1, 1, 1, 1 });
+
+ assertTrue(node.hasDenseLookup());
+
+ assertSame(children[0], node.findChild('a')); // lowest stored label
+ assertSame(children[3], node.findChild('d')); // highest stored label
+ assertNull(node.findChild('z')); // label with no stored edge
+ }
+
+ /**
+ * Verifies that a sparse four-edge node skips the dense table yet still resolves children.
+ */
+ @Test
+ @DisplayName("CompiledNode resolves child by linear scan for small degree")
+ void compiledNodeUsesLinearScanForSmallDegree() {
+ @SuppressWarnings("unchecked")
+ final CompiledNode[] children = new CompiledNode[4];
+ final CompiledNode childA = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+ final CompiledNode childB = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+ final CompiledNode childC = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+ final CompiledNode childD = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+ children[0] = childA;
+ children[1] = childB;
+ children[2] = childC;
+ children[3] = childD;
+
+ final CompiledNode node = new CompiledNode<>(new char[] { 'a', 'z', '中', '你' }, children,
+ new String[] { "1", "2", "3", "4" }, 0, new int[] { 1, 1, 1, 1 }); // NOTE(review): meaning of the 0 argument is not visible here - confirm
+
+ assertFalse(node.hasDenseLookup());
+
+ assertSame(childA, node.findChild('a')); // first stored label
+ assertSame(childD, node.findChild('你')); // last stored label
+ assertNull(node.findChild('b')); // falls between stored labels, so no edge exists
+ }
+
+ /**
+ * Verifies that a five-edge node built without a dense table still resolves
+ * stored labels to their children and reports a missing label as absent.
+ */
+ @Test
+ @DisplayName("CompiledNode resolves child by binary search for large degree")
+ void compiledNodeUsesBinarySearchForLargeDegree() {
+ @SuppressWarnings("unchecked")
+ final CompiledNode[] children = new CompiledNode[5];
+ final CompiledNode childA = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+ final CompiledNode childB = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+ final CompiledNode childC = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+ final CompiledNode childD = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+ final CompiledNode childE = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
+ children[0] = childA;
+ children[1] = childB;
+ children[2] = childC;
+ children[3] = childD;
+ children[4] = childE;
+
+ final CompiledNode node = new CompiledNode<>(new char[] { 'a', 'c', 'k', 't', 'z' }, children,
+ new String[] { "1", "2", "3", "4", "5" }, 0, new int[] { 1, 1, 1, 1, 1 });
+
+ assertFalse(node.hasDenseLookup());
+
+ assertSame(childC, node.findChild('k')); // middle stored label
+ assertSame(childE, node.findChild('z')); // last stored label
+ assertNull(node.findChild('x')); // falls between 't' and 'z', so no edge exists
+ }
+
+ /**
+ * Checks the leaf, child, value and edge presence helpers on an empty node and
+ * on a node that carries one edge and one value.
+ */
+ @Test
+ @DisplayName("CompiledNode reports leaf, value and edge presence state")
+ void compiledNodeReportsNodeStateHelpers() {
+ @SuppressWarnings("unchecked")
+ final CompiledNode[] noChildren = new CompiledNode[0];
+ final CompiledNode emptyNode = new CompiledNode<>(new char[0], noChildren, new String[0], new int[0]);
+
+ assertFalse(emptyNode.hasEdge('a'));
+ assertFalse(emptyNode.hasValues());
+ assertFalse(emptyNode.hasChildren());
+ assertTrue(emptyNode.isLeaf());
+
+ final String[] values = new String[] { "leaf" };
+ final int[] counts = new int[] { 1 };
+ @SuppressWarnings("unchecked")
+ final CompiledNode[] singleChild = new CompiledNode[1];
+ singleChild[0] = new CompiledNode<>(new char[0], new CompiledNode[0], values, counts);
+ final CompiledNode parent = new CompiledNode<>(new char[] { 'a' }, singleChild, values, counts);
+
+ assertTrue(parent.hasEdge('a'));
+ assertFalse(parent.hasEdge('b'));
+ assertTrue(parent.valueCount() > 0);
+ assertTrue(parent.hasValues());
+ assertTrue(parent.hasChildren());
+ assertFalse(parent.isLeaf());
+ }
+
+ /**
+ * Checks that two separately built nodes with the same content are equal and hash alike.
+ */
+ @Test
+ @DisplayName("CompiledNode equals and hashCode align for identical structure")
+ void compiledNodeEqualsAndHashCodeAlignForIdenticalStructure() {
+ final CompiledNode sharedLeaf = new CompiledNode<>(new char[0], new CompiledNode[0], new String[] { "v" },
+ new int[] { 1 });
+ @SuppressWarnings("unchecked")
+ final CompiledNode[] sharedChildren = new CompiledNode[1];
+ sharedChildren[0] = sharedLeaf;
+
+ final CompiledNode left = new CompiledNode<>(new char[] { 'a' }, sharedChildren, new String[] { "x" },
+ new int[] { 2 });
+ final CompiledNode right = new CompiledNode<>(new char[] { 'a' }, sharedChildren, new String[] { "x" },
+ new int[] { 2 });
+
+ assertEquals(left.hashCode(), right.hashCode());
+ assertEquals(left, right);
+ }
}
diff --git a/src/test/java/org/egothor/stemmer/trie/DominantLocalDescriptorTest.java b/src/test/java/org/egothor/stemmer/trie/DominantLocalDescriptorTest.java
index aaa250c..df9855f 100644
--- a/src/test/java/org/egothor/stemmer/trie/DominantLocalDescriptorTest.java
+++ b/src/test/java/org/egothor/stemmer/trie/DominantLocalDescriptorTest.java
@@ -41,7 +41,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link DominantLocalDescriptor}.
*/
@Tag("unit")
-@Tag("fast")
+@Tag("trie")
@DisplayName("DominantLocalDescriptor")
class DominantLocalDescriptorTest {
diff --git a/src/test/java/org/egothor/stemmer/trie/LocalValueSummaryTest.java b/src/test/java/org/egothor/stemmer/trie/LocalValueSummaryTest.java
index bf9ab6c..3c9e3bb 100644
--- a/src/test/java/org/egothor/stemmer/trie/LocalValueSummaryTest.java
+++ b/src/test/java/org/egothor/stemmer/trie/LocalValueSummaryTest.java
@@ -50,7 +50,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link LocalValueSummary}.
*/
@Tag("unit")
-@Tag("fast")
+@Tag("trie")
@DisplayName("LocalValueSummary")
class LocalValueSummaryTest {
diff --git a/src/test/java/org/egothor/stemmer/trie/MutableNodeTest.java b/src/test/java/org/egothor/stemmer/trie/MutableNodeTest.java
index 56dc468..c936e4d 100644
--- a/src/test/java/org/egothor/stemmer/trie/MutableNodeTest.java
+++ b/src/test/java/org/egothor/stemmer/trie/MutableNodeTest.java
@@ -44,7 +44,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link MutableNode}.
*/
@Tag("unit")
-@Tag("fast")
+@Tag("trie")
@DisplayName("MutableNode")
class MutableNodeTest {
diff --git a/src/test/java/org/egothor/stemmer/trie/RankedLocalDescriptorTest.java b/src/test/java/org/egothor/stemmer/trie/RankedLocalDescriptorTest.java
index 446a0c9..37abf38 100644
--- a/src/test/java/org/egothor/stemmer/trie/RankedLocalDescriptorTest.java
+++ b/src/test/java/org/egothor/stemmer/trie/RankedLocalDescriptorTest.java
@@ -41,7 +41,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link RankedLocalDescriptor}.
*/
@Tag("unit")
-@Tag("fast")
+@Tag("trie")
@DisplayName("RankedLocalDescriptor")
class RankedLocalDescriptorTest {
diff --git a/src/test/java/org/egothor/stemmer/trie/ReducedNodeTest.java b/src/test/java/org/egothor/stemmer/trie/ReducedNodeTest.java
index b93a01e..976cb54 100644
--- a/src/test/java/org/egothor/stemmer/trie/ReducedNodeTest.java
+++ b/src/test/java/org/egothor/stemmer/trie/ReducedNodeTest.java
@@ -48,7 +48,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link ReducedNode}.
*/
@Tag("unit")
-@Tag("fast")
+@Tag("trie")
@DisplayName("ReducedNode")
class ReducedNodeTest {
diff --git a/src/test/java/org/egothor/stemmer/trie/ReductionContextTest.java b/src/test/java/org/egothor/stemmer/trie/ReductionContextTest.java
index 96b729c..bff1c75 100644
--- a/src/test/java/org/egothor/stemmer/trie/ReductionContextTest.java
+++ b/src/test/java/org/egothor/stemmer/trie/ReductionContextTest.java
@@ -47,7 +47,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link ReductionContext}.
*/
@Tag("unit")
-@Tag("fast")
+@Tag("trie")
@DisplayName("ReductionContext")
class ReductionContextTest {
diff --git a/src/test/java/org/egothor/stemmer/trie/ReductionSignatureTest.java b/src/test/java/org/egothor/stemmer/trie/ReductionSignatureTest.java
index d4a8b56..676f354 100644
--- a/src/test/java/org/egothor/stemmer/trie/ReductionSignatureTest.java
+++ b/src/test/java/org/egothor/stemmer/trie/ReductionSignatureTest.java
@@ -46,7 +46,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link ReductionSignature}.
*/
@Tag("unit")
-@Tag("fast")
+@Tag("trie")
@DisplayName("ReductionSignature")
class ReductionSignatureTest {
diff --git a/src/test/java/org/egothor/stemmer/trie/UnorderedLocalDescriptorTest.java b/src/test/java/org/egothor/stemmer/trie/UnorderedLocalDescriptorTest.java
index 664ce23..f2bdf0a 100644
--- a/src/test/java/org/egothor/stemmer/trie/UnorderedLocalDescriptorTest.java
+++ b/src/test/java/org/egothor/stemmer/trie/UnorderedLocalDescriptorTest.java
@@ -41,7 +41,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link UnorderedLocalDescriptor}.
*/
@Tag("unit")
-@Tag("fast")
+@Tag("trie")
@DisplayName("UnorderedLocalDescriptor")
class UnorderedLocalDescriptorTest {