feat: implement dense-child optimized trie lookup and enterprise test/CI profile hardening

This commit is contained in:
2026-05-16 03:24:07 +02:00
parent 50c3ab3432
commit dadab5514e
44 changed files with 2052 additions and 294 deletions

View File

@@ -51,7 +51,7 @@ jobs:
test -f gradle/verification-metadata.xml
- name: Execute build, tests, PMD, coverage, Javadoc, distribution packaging, and SBOM generation
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport distZip cyclonedxBom
run: ./gradlew --no-daemon clean ciRelease distZip pmdMain javadoc jacocoCiReleaseReport cyclonedxBom
- name: Upload SBOM
if: always()
@@ -70,8 +70,8 @@ jobs:
with:
name: test-reports
path: |
build/reports/tests/test
build/test-results/test
build/reports/tests
build/test-results
if-no-files-found: warn
retention-days: 14
@@ -90,8 +90,8 @@ jobs:
with:
name: coverage-reports
path: |
build/reports/jacoco/test/html
build/reports/jacoco/test/jacocoTestReport.xml
build/reports/jacoco/jacocoCiReleaseReport/html
build/reports/jacoco/jacocoCiReleaseReport/jacocoCiReleaseReport.xml
if-no-files-found: warn
retention-days: 14
@@ -160,7 +160,7 @@ jobs:
env:
SIGNING_KEY: ${{ secrets.SIGNING_KEY }}
SIGNING_PASSWORD: ${{ secrets.SIGNING_PASSWORD }}
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport cyclonedxBom centralBundle
run: ./gradlew --no-daemon clean ciRelease distZip pmdMain javadoc jacocoCiReleaseReport cyclonedxBom centralBundle
- name: Generate release changelog
shell: bash

View File

@@ -70,7 +70,7 @@ jobs:
test -f gradle/verification-metadata.xml
- name: Build reports for publication
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport pitest jmh cyclonedxBom
run: ./gradlew --no-daemon clean ciRelease pmdMain javadoc jacocoCiReleaseReport pitest jmh cyclonedxBom
- name: Prepare gh-pages worktree
shell: bash
@@ -93,6 +93,9 @@ jobs:
run: |
set -euo pipefail
TEST_REPORT_DIR="build/reports/tests/ciRelease"
JACOCO_REPORT_DIR="build/reports/jacoco/jacocoCiReleaseReport"
SITE_DIR=".gh-pages"
RUN_DIR="${SITE_DIR}/builds/${GITHUB_RUN_NUMBER}"
RUN_METRICS_DIR="${RUN_DIR}/metrics"
@@ -106,14 +109,14 @@ jobs:
cp -R build/docs/javadoc "${RUN_DIR}/javadoc"
cp -R build/docs/javadoc "${LATEST_DIR}/javadoc"
cp -R build/reports/tests/test "${RUN_DIR}/test"
cp -R build/reports/tests/test "${LATEST_DIR}/test"
cp -R "${TEST_REPORT_DIR}" "${RUN_DIR}/test"
cp -R "${TEST_REPORT_DIR}" "${LATEST_DIR}/test"
cp -R build/reports/pmd "${RUN_DIR}/pmd"
cp -R build/reports/pmd "${LATEST_DIR}/pmd"
cp -R build/reports/jacoco/test/html "${RUN_DIR}/coverage"
cp -R build/reports/jacoco/test/html "${LATEST_DIR}/coverage"
cp -R "${JACOCO_REPORT_DIR}/html" "${RUN_DIR}/coverage"
cp -R "${JACOCO_REPORT_DIR}/html" "${LATEST_DIR}/coverage"
cp -R build/reports/pitest "${RUN_DIR}/pitest"
cp -R build/reports/pitest "${LATEST_DIR}/pitest"
@@ -178,7 +181,7 @@ jobs:
python3 \
./tools/generate-pages-badges.py \
--jacoco-xml build/reports/jacoco/test/jacocoTestReport.xml \
--jacoco-xml "${JACOCO_REPORT_DIR}/jacocoCiReleaseReport.xml" \
--pit-xml build/reports/pitest/mutations.xml \
--jmh-csv build/reports/jmh/jmh-results.csv \
--run-metrics-dir "${RUN_METRICS_DIR}" \
@@ -228,7 +231,7 @@ jobs:
<p class="meta">Build ${GITHUB_RUN_NUMBER} from commit ${GITHUB_SHA}</p>
<ul>
<li><a href="./javadoc/">Javadoc</a></li>
<li><a href="./test/">Test Report</a></li>
<li><a href="./test/">Release Verification Test Report (ciRelease)</a></li>
<li><a href="./pmd/main.html">PMD Report</a></li>
<li><a href="./coverage/">Coverage Report</a></li>
${DEPENDENCY_CHECK_LINK:-<li>Dependency Vulnerability Report: not available</li>}
@@ -260,7 +263,7 @@ jobs:
- [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/)
- [Javadoc](https://leogalambos.github.io/Radixor/builds/latest/javadoc/)
- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
- [Release verification test report (ciRelease)](https://leogalambos.github.io/Radixor/builds/latest/test/)
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)

View File

@@ -167,6 +167,9 @@ The repository keeps the front page concise and places detailed documentation un
- [Architecture](docs/architecture.md)
Structural model, data flow, and runtime lookup behavior.
- [Lookup Edge Optimization](docs/lookup-edge-optimization.md)
Speed/memory trade-off of dense child edge lookup in compiled tries.
- [Reduction Semantics](docs/reduction-semantics.md)
Ranked, unordered, and dominant reduction behavior.

View File

@@ -108,9 +108,19 @@ dependencyCheck {
}
}
tasks.withType(Test).configureEach {
useJUnitPlatform()
// Resolves a tag option given on the command line: a non-empty project
// property (-P<name>=...) wins, otherwise the JVM system property
// (-D<name>=...) is used. Yields null when neither is present.
def resolveCliTagOption = { String optionName ->
    final def projectValue = project.findProperty(optionName)?.toString()
    return projectValue ? projectValue : System.getProperty(optionName)
}
def cliIncludeTags = resolveCliTagOption('includeTags')
def cliExcludeTags = resolveCliTagOption('excludeTags')
// Splits a comma-separated tag expression into a list of trimmed tags.
// A null or blank expression, as well as blank items between commas,
// contribute nothing; the result is never null.
def splitTagExpression = { String tagsExpr ->
    final def tags = []
    if (tagsExpr != null && !tagsExpr.isBlank()) {
        for (String candidate : tagsExpr.split(',')) {
            final String tag = candidate.trim()
            if (!tag.isBlank()) {
                tags << tag
            }
        }
    }
    return tags
}
tasks.withType(Test).configureEach {
doFirst {
jvmArgs "-javaagent:${configurations.mockitoAgent.singleFile}"
}
@@ -123,12 +133,125 @@ tasks.withType(Test).configureEach {
minHeapSize = '1g'
maxHeapSize = '4g'
reports {
junitXml.required = true
html.required = true
}
}
// Switches the given Test task to the JUnit Platform and applies tag filters.
//   task            - Test task to configure
//   includeTagsExpr - comma-separated tags to include; null/blank adds no include filter
//   excludeTagsExpr - comma-separated tags to exclude; null/blank adds no exclude filter
def configureJUnitPlatformTags = { Test task, String includeTagsExpr, String excludeTagsExpr ->
    task.useJUnitPlatform {
        final def wantedTags = splitTagExpression(includeTagsExpr)
        final def unwantedTags = splitTagExpression(excludeTagsExpr)
        // An empty list is falsy in Groovy, so filters are only set when tags were given.
        if (wantedTags) {
            includeTags(wantedTags as String[])
        }
        if (unwantedTags) {
            excludeTags(unwantedTags as String[])
        }
    }
}
// The default test task honours the tag filters given on the command line
// and is always followed by the JaCoCo report for that run.
tasks.named('test', Test) { Test testTask ->
    configureJUnitPlatformTags(testTask, cliIncludeTags, cliExcludeTags)
    testTask.finalizedBy(tasks.named('jacocoTestReport'))
}
// Registers a tag-driven Test profile task in the 'verification' group.
//   taskName                - name of the Test task to register
//   includeTagsExpr         - comma-separated JUnit tags to include (null/blank = no include filter)
//   excludeTagsExpr         - comma-separated JUnit tags to exclude (null/blank = no exclude filter)
//   taskDescription         - description shown by `gradle tasks`
//   testNameExcludePatterns - comma-separated test-name patterns excluded in addition to tag filtering
//
// The Mockito -javaagent argument, heap sizes and report settings are NOT
// repeated here: tasks.withType(Test).configureEach already applies them to
// every Test task, including the ones registered by this closure. Repeating
// them passed the agent argument twice and duplicated settings that can drift.
def configureTaggedTestProfile = { String taskName, String includeTagsExpr, String excludeTagsExpr = null,
        String taskDescription = null, String testNameExcludePatterns = null ->
    tasks.register(taskName, Test) {
        group = 'verification'
        description = taskDescription

        configureJUnitPlatformTags(delegate as Test, includeTagsExpr, excludeTagsExpr)

        // Custom Test tasks are wired explicitly to the test source set.
        testClassesDirs = sourceSets.test.output.classesDirs
        classpath = sourceSets.test.runtimeClasspath
        dependsOn(tasks.named('compileTestJava'))

        // Name-based exclusions act as a second guard next to the tag filters.
        if (testNameExcludePatterns != null && !testNameExcludePatterns.isBlank()) {
            filter {
                testNameExcludePatterns.split(',').each { String pattern ->
                    final def trimmedPattern = pattern.trim()
                    if (!trimmedPattern.isEmpty()) {
                        excludeTestsMatching(trimmedPattern)
                    }
                }
            }
        }
    }
}
// Profile registrations. Argument order for every call:
//   task name, include tags, exclude tags, description, test-name exclude patterns.

// ciSmoke: tests tagged 'unit', minus anything tagged 'slow'; CompileIntegrationTest*
// is additionally excluded by test name.
configureTaggedTestProfile(
'ciSmoke',
'unit',
'slow',
'Fast feedback profile for unit tests with slow tests explicitly excluded.',
'org.egothor.stemmer.CompileIntegrationTest*'
)
// ciCore: tests carrying any of the listed tags; no tag or name exclusions.
configureTaggedTestProfile(
'ciCore',
'unit,trie,frequency-trie,property',
null,
'Focused profile for core trie behavior and trie-specific property checks.'
)
// ciIntegration: tests tagged 'integration', minus anything tagged 'slow'.
configureTaggedTestProfile(
'ciIntegration',
'integration',
'slow',
'Integration pipeline profile (loader/parser/CLI/IO end-to-end flows) excluding slow integration paths.'
)
// ciSlow: only tests tagged 'slow'.
configureTaggedTestProfile(
'ciSlow',
'slow',
null,
'Targeted profile for all slow tests (large dictionaries, long-running corpus validation, and heavy integration checks).'
)
// ciCompat: tests tagged 'compat' or 'regression'.
configureTaggedTestProfile(
'ciCompat',
'compat,regression',
null,
'Compatibility profile guarding persisted artifact and compatibility regressions.'
)
// ciRelease: no include filter (null), so every test not tagged 'slow' is selected;
// two test-name patterns are excluded on top of the tag filter.
configureTaggedTestProfile(
'ciRelease',
null,
'slow',
'Release-profile validation of all non-slow tests.',
'org.egothor.stemmer.CompileIntegrationTest*,org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*'
)
// ciNightly: only tests tagged 'fuzz'.
configureTaggedTestProfile(
'ciNightly',
'fuzz',
null,
'Nightly robustness profile with fuzz testing emphasis.'
)
// Profiles aggregated by the umbrella `ci` task, in their intended execution order.
def ciProfileTaskNames = ['ciSmoke', 'ciCore', 'ciIntegration', 'ciCompat']

// dependsOn alone defines no ordering between the profiles. Chaining
// mustRunAfter makes them really run "in sequence", as the task description
// promises, whenever they are scheduled together. Running a single profile
// on its own is unaffected.
ciProfileTaskNames.collate(2, 1, false).each { List<String> pair ->
    tasks.named(pair[1]) {
        mustRunAfter(tasks.named(pair[0]))
    }
}

// Umbrella task: runs smoke, core, integration and compat profiles.
// The slow, release and nightly profiles are not part of it.
tasks.register('ci') {
    group = 'verification'
    description = 'Runs the full enterprise CI profile set in sequence.'
    dependsOn(ciProfileTaskNames.collect { String profileName -> tasks.named(profileName) })
}
tasks.withType(Pmd).configureEach {
@@ -155,6 +278,36 @@ tasks.named('jacocoTestReport', JacocoReport) {
}
}
// Registers a JacocoReport task bound to one Test profile task.
//   reportTaskName - name of the report task to create
//   sourceTaskName - Test task the report depends on; only the execution data
//                    file build/jacoco/<sourceTaskName>.exec is read
def registerJacocoProfileReport = { String reportTaskName, String sourceTaskName ->
    tasks.register(reportTaskName, JacocoReport) {
        group = 'verification'
        description = "Generates Jacoco report for ${sourceTaskName} execution."
        dependsOn(tasks.named(sourceTaskName))

        // Main output classes, with the two excluded class patterns left out of the report.
        final def reportedClasses = files(sourceSets.main.output).asFileTree.matching {
            exclude(
                'org/egothor/stemmer/StemmerKnowledgeExperiment*',
                'org/egothor/stemmer/DiacriticStripper*'
            )
        }
        classDirectories.setFrom(reportedClasses)

        // Restrict execution data to the file named after the source task.
        final def profileExecutionData = fileTree(layout.buildDirectory.dir('jacoco')) {
            include "${sourceTaskName}.exec"
        }
        executionData.setFrom(profileExecutionData)

        reports {
            html.required = true
            xml.required = true
            csv.required = false
        }
    }
}

// Coverage report for the release verification profile.
registerJacocoProfileReport('jacocoCiReleaseReport', 'ciRelease')
tasks.named('check') {
dependsOn(tasks.named('jacocoTestReport'))
// no-default, only on-demand: dependsOn(tasks.named('dependencyCheckAnalyze'))

View File

@@ -0,0 +1,193 @@
# Lookup Edge Optimization
Compiled trie nodes (`CompiledNode`) use three lookup strategies when resolving child edges:
1. dense array direct lookup,
2. linear scan for very small child counts,
3. binary search over sorted edge labels.
This page explains the dense path, what `maxExpandedIndex` controls, and how to tune it.
## Runtime model of one node
For a node with sorted edge labels `char[] edges`, the implementation can materialize an
index-aligned dense table when labels occupy a small compact code-point interval:
```text
span = maxEdge - minEdge
use dense table iff (span <= maxExpandedIndex) and (maxExpandedIndex > 0)
```
When dense lookup is used, lookup is constant-time indexing:
```text
denseIndex = requestedEdge - minEdge
return denseChildren[denseIndex] // or null if outside interval
```
When dense lookup is not active (interval is too wide or the configured
`maxExpandedIndex` is `0`), `CompiledNode` still chooses between two fallback
strategies:
- **linear scan** for very small child counts (`4` or fewer children),
- **binary search** for larger child counts.
This means the fallback method is selected by child count, not by “distance” alone.
`linear scan` is therefore used when there are only a few edges even if those edges are
spread across very distant code points.
### Example: few edges, wide Unicode span
```text
edges = ['a', '中', '你']
edge count = 3
minEdge = 'a' (U+0061)
maxEdge = '你' (U+4F60)
span = 20223
```
- If `maxExpandedIndex = 512`, dense indexing is not used because `span > maxExpandedIndex`.
- Because `edge count = 3` (<= 4), lookup falls back to a tiny linear scan of the
three labels.
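- This is exactly the case where the threshold pays off even though the interval is wide: no dense table spanning the whole interval is allocated for just three children, and lookup remains a cheap scan of three labels.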
Dense lookup is useful for non-Latin scripts as well: what matters is interval width in Unicode
code points, not script name. A compact Arabic-range block can still benefit from dense
lookups when keys stay in a tight code-point interval.
## Why this is configurable
`maxExpandedIndex` is purely a performance/memory trade-off:
- higher value:
- more compact intervals qualify for dense tables,
- more constant-time child lookup,
- more memory for dense tables in qualifying nodes.
- lower value (or `0`):
- less dense-table allocation,
- fewer lookups served by the constant-time path,
- lower materialization memory.
The value never changes lookup semantics. It only changes the in-memory structure shape.
## Persistence and loading model
This threshold is **not** stored in `TrieMetadata`.
- The binary format stores only trie payload and semantic metadata (`reduction`, `traversal`,
case/diacritic settings, and stream version).
- `maxExpandedIndex` is chosen when materializing nodes in memory.
- You can therefore keep one persisted artifact and load it with different in-memory
trade-offs depending on deployment constraints.
## Default
- `FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX == 512`
- `CompiledNode.DEFAULT_MAX_EXPANDED_INDEX == 512`
These are practical defaults for mixed-language text and Latin-like scripts where edge labels
often cluster.
## Tune during build (writable phase)
Use the full `FrequencyTrie.Builder` constructor when you are compiling from source data.
The builder threshold is applied while freezing reduced nodes into the immutable form.
```java
import org.egothor.stemmer.CaseProcessingMode;
import org.egothor.stemmer.DiacriticProcessingMode;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
import org.egothor.stemmer.WordTraversalDirection;
final ReductionSettings settings = ReductionSettings.withDefaults(
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
final FrequencyTrie.Builder<String> fastBuilder =
new FrequencyTrie.Builder<>(String[]::new,
settings,
WordTraversalDirection.BACKWARD,
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
DiacriticProcessingMode.AS_IS,
1024); // prefer lookup speed
// ... put(...) ...
final FrequencyTrie<String> trie = fastBuilder.build();
```
Use a smaller value such as `256`, or `0` to disable dense tables entirely, for a lower memory footprint when building large tries.
```java
final FrequencyTrie.Builder<String> compactBuilder =
new FrequencyTrie.Builder<>(String[]::new,
settings,
WordTraversalDirection.BACKWARD,
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
DiacriticProcessingMode.AS_IS,
256); // lower memory profile
```
## Tune when loading a binary artifact (runtime phase)
At artifact load time, you can tune the same trade-off independently of persisted metadata.
```java
import java.nio.file.Path;
import org.egothor.stemmer.StemmerPatchTrieLoader;
var defaultLookup = StemmerPatchTrieLoader.loadBinary(
Path.of("stemmers", "english.radixor.gz"));
var fastLookup = StemmerPatchTrieLoader.loadBinary(
Path.of("stemmers", "english.radixor.gz"), 1024);
var compactLookup = StemmerPatchTrieLoader.loadBinary(
Path.of("stemmers", "english.radixor.gz"), 0);
```
You can also set the threshold directly with `FrequencyTrie.readFrom(...)` when reading streams:
```java
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.GZIPInputStream;
import org.egothor.stemmer.FrequencyTrie;
public final class StreamLoadExample {
private StreamLoadExample() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) throws IOException {
try (InputStream fileInput = Files.newInputStream(Path.of("stemmers", "english.radixor.gz"));
GZIPInputStream gzip = new GZIPInputStream(fileInput);
DataInputStream dataInput = new DataInputStream(gzip)) {
final FrequencyTrie<String> compactOnLoad = FrequencyTrie.readFrom(
dataInput,
String[]::new,
input -> input.readUTF(),
256);
}
}
}
```
Note: the string codec is intentionally inline in this snippet to keep it self-contained.
## Practical guidance
- Start with default (`512`) in production and profile before changing it.
- Use `0` when memory is the priority and query throughput is not the bottleneck.
- Use values around `1024` for workloads dominated by compact alphabets and very hot lookups.
Trade-off expectation:
- increasing `maxExpandedIndex` improves lookup speed when edges tend to occupy short spans,
- decreasing it reduces per-node auxiliary memory in dense-span nodes.

View File

@@ -87,6 +87,43 @@ public final class LoadBinaryExample {
The binary format is the native `FrequencyTrie` serialization wrapped in GZip compression. It includes persisted `TrieMetadata`, so lookup after loading uses the traversal, case-processing, diacritic-processing, and reduction settings captured when the trie was compiled.
## Tune child lookup density when loading binaries
To optimize hot-path latency, you can tune direct child indexing by passing `maxExpandedIndex`
at load time. This does not change persisted metadata, only the materialized in-memory form.
```java
import java.io.IOException;
import java.nio.file.Path;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.StemmerPatchTrieLoader;
public final class LoadBinaryWithDenseLookupExample {
private LoadBinaryWithDenseLookupExample() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) throws IOException {
final FrequencyTrie<String> balanced = StemmerPatchTrieLoader.loadBinary(
Path.of("stemmers", "english.radixor.gz"));
final FrequencyTrie<String> fast = StemmerPatchTrieLoader.loadBinary(
Path.of("stemmers", "english.radixor.gz"),
1024);
final FrequencyTrie<String> compact = StemmerPatchTrieLoader.loadBinary(
Path.of("stemmers", "english.radixor.gz"),
0);
}
}
```
Negative values still use `FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX`.
[Lookup Edge Optimization](lookup-edge-optimization.md) describes the trade-off in detail and also includes examples for build-time tuning.
## Build directly with a mutable builder
A `FrequencyTrie.Builder<V>` accepts repeated `put(key, value)` calls and compiles the final read-only trie through `build()`. Compilation performs bottom-up reduction and produces the compact immutable runtime representation.

View File

@@ -25,6 +25,7 @@ This is why Radixor can generalize beyond explicitly listed forms and why compil
The programmatic API is easier to understand when split by developer task:
- [Loading and Building Stemmers](programmatic-loading-and-building.md) explains how to acquire a compiled stemmer from bundled resources, textual dictionaries, binary artifacts, or direct builder usage.
- [Lookup Edge Optimization](lookup-edge-optimization.md) explains dense child lookup tuning and the speed/memory trade-off when materializing compiled tries.
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md) explains `get(...)`, `getAll(...)`, `getEntries(...)`, patch application, and the practical meaning of reduction modes.
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md) explains how to reopen compiled tries, add new lexical data, rebuild them, and store them as binary artifacts.

View File

@@ -58,6 +58,27 @@ A deterministic system is easier to test, easier to reason about, and safer to i
The project is intended to maintain very high confidence in both core correctness and behavioral stability.
The recommended execution strategy is defined by the tagged test profiles in [Test taxonomy and execution filtering](test-taxonomy-and-filtering.md). In practice, teams can execute profile tasks directly:
- `./gradlew ciSmoke`: fast local/PR safety checks (`unit`, excluding `slow`; additionally excludes
`CompileIntegrationTest` as a defensive safeguard).
- `./gradlew ciSlow`: enterprise heavy gate for all tests marked with `slow` (typically
production dictionary and large corpus verification). This should be used for scheduled/manual
hardening gates, not in the standard release build.
- `./gradlew ciCore`: behavioral coverage of trie and frequency-trie paths (tests tagged `unit`, `trie`, `frequency-trie`, or `property`)
- `./gradlew ciIntegration`: pipeline and CLI integration path checks
- `./gradlew ciCompat`: compatibility and regression verification for persisted artifacts
- `./gradlew ciRelease`: full non-slow suite for release-confidence runs (all test tags except `slow`,
plus explicit name-based exclusion of `CompileIntegrationTest*` and
`StemmerPatchTrieLoaderTest$BundledDictionaryTests*` as additional guardrails)
- `./gradlew ciNightly`: extended fuzz profile for robustness hardening
- `./gradlew ci`: umbrella profile depending on smoke/core/integration/compat
## Test taxonomy and execution filtering
The full tag taxonomy and executable filter examples are documented in
[Test taxonomy and execution filtering](test-taxonomy-and-filtering.md).
### Structural coverage
High code coverage is treated as a useful signal, but not as a sufficient goal on its own. Coverage is valuable only when the covered scenarios actually pressure the implementation in meaningful ways.

View File

@@ -67,6 +67,36 @@ public final class LoadBinaryStemmerExample {
}
```
You can tune in-memory child lookup density at load time without changing the artifact:
```java
import java.io.IOException;
import java.nio.file.Path;
import org.egothor.stemmer.FrequencyTrie;
import org.egothor.stemmer.StemmerPatchTrieLoader;
public final class LoadBinaryStemmerExampleTuned {
private LoadBinaryStemmerExampleTuned() {
throw new AssertionError("No instances.");
}
public static void main(final String[] arguments) throws IOException {
final FrequencyTrie<String> fast = StemmerPatchTrieLoader.loadBinary(
Path.of("stemmers", "english.radixor.gz"),
1024);
final FrequencyTrie<String> compact = StemmerPatchTrieLoader.loadBinary(
Path.of("stemmers", "english.radixor.gz"),
128);
System.out.println("fast=" + fast.size() + ", compact=" + compact.size());
}
}
```
For the trade-off details, see [Lookup Edge Optimization](lookup-edge-optimization.md).
### Build or extend a stemmer from dictionary data
Radixor can also build a compiled trie from a custom dictionary. Dictionary lines consist of a canonical stem followed by zero or more variants. The input may be plain UTF-8 text or GZip-compressed UTF-8 text when loaded from a filesystem path. The parser applies `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), ignores leading and trailing whitespace around columns, supports line remarks introduced by `#` or `//`, and skips dictionary items that contain embedded whitespace.

View File

@@ -23,7 +23,7 @@ These reports are primarily useful when reviewing the published API surface and
These reports describe the outcome of core verification and static-analysis stages for the latest published build:
- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
- [Release verification test report (ciRelease)](https://leogalambos.github.io/Radixor/builds/latest/test/)
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)

View File

@@ -0,0 +1,216 @@
# Test Tag Taxonomy and Execution Guide
Radixor uses JUnit tags as an explicit execution policy for its test suite.
The project uses three orthogonal axes:
1. **Scope** (how the test is executed in the pipeline)
2. **Domain** (where in the system it belongs)
3. **Intent** (what behavior it verifies)
## Canonical scope tags
| Tag | Description | Typical usage |
| --- | --- | --- |
| `unit` | Fast, deterministic tests that exercise a specific class or behavior without external processes. | Default developer feedback; should stay near-zero flakiness and low run time. |
| `integration` | Tests that span multiple components or end-to-end flows of the public pipeline. | Parser/loader/CLI/IO integration checks and multi-step compile-then-load validations. |
| `property` | Property-based tests with generator-driven coverage for invariants. | Semantics-preserving laws and edge-case exploration beyond curated fixtures. |
| `fuzz` | Randomized stress checks with bounded runtime. | Heavier probabilistic verification of robustness and reduction invariants. |
| `compat` | Backward/forward compatibility and reproducibility checks for persisted artifacts. | Artifact fingerprints, deterministic rebuild, and regression fixtures. |
| `slow` | Long-running or expensive tests that should not execute in every fast gate. | Heavy fuzz/property budgets or high-duration integration checks. |
## Canonical domain tags
| Tag | Description | Typical usage |
| --- | --- | --- |
| `core` | Core algorithm and foundational platform behavior. | Traversal direction, base data structures, low-level helpers. |
| `trie` | All mutable/compiled trie behaviors and traversal internals. | Lookup path selection, node shape, child representation, subtree behavior. |
| `frequency-trie` | Algorithms and corner cases specific to frequency-aware trie logic. | Ranking, weighted reductions, persistence of weighted nodes. |
| `stemmer` | End-user stemming pipeline semantics. | Parse-encode-apply flows and output invariants. |
| `patch` | Patch encoding, decoding, and application semantics. | `PatchCommandEncoder` behavior and related compatibility contracts. |
| `io` | Input/output and resource loading boundaries. | Filesystem readers, streams, and stream lifecycle handling. |
| `serialization` | Binary persistence contract of compiled artifacts. | Versioned format reads/writes and checksum/consistency checks. |
| `parser` | Dictionary and metadata parsing concerns. | Dictionary input parsing and malformed-source rejection. |
| `cli` | Command-line entrypoint and command orchestration behavior. | Compile CLI integration and CLI argument validation. |
| `metadata` | Trie metadata semantics, compatibility fields, and schema expectations. | Version flags, structural properties, and metadata round-trips. |
| `compile` | Compile-time pipeline and build-oriented behavior. | Building, reduction-mode behavior, and compiled artifact generation. |
| `diacritic` | Unicode diacritic normalization and stripping behavior. | Accent-removal correctness and locale-safe normalization checks. |
## Canonical intent tags
| Tag | Description | Typical usage |
| --- | --- | --- |
| `construction` | Tests around construction and assembly of runtime structures. | Builders, loaders, and compile-time object construction contracts. |
| `lookup` | Read behavior and retrieval semantics. | `get()`, `getAll()`, traversal and missing-key behavior. |
| `persistence` | Storage lifecycle semantics. | Serialization/deserialization and round-trip correctness. |
| `reduction` | Reduction algorithm correctness and corner cases. | Dominance threshold, subtree deduplication, rank-preservation invariants. |
| `encoding` | Encoding transformation direction. | `PatchCommandEncoder.encode` and serialized command form generation. |
| `decoding` | Decoding/interpretation of persisted or runtime commands. | Optional consumers that parse and apply encoded command payloads. |
| `apply` | Patch application and transformation behavior. | Verifies that applied patches produce expected derived forms. |
| `normalization` | Canonicalization and cleanup behavior. | String normalization around case/shape and mirrored input paths. |
| `validation` | Input rejection and defensive checks. | Null/empty/invalid contracts and explicit failure conditions. |
| `regression` | Guard tests for behavior changes over time. | Known historical bugs and behavioral drift prevention. |
| `determinism` | Repeatable results under fixed input and settings. | Compile determinism, stable ordering, and artifact reproducibility. |
| `error-handling` | Exception surface and robustness expectations. | Recovery/failure modes and diagnostics quality. |
## Class-level rules
1. Every test class has **exactly one** scope tag.
2. Every test class has at least one domain tag.
3. Additional tags describe intent and may be used on classes or nested tests.
4. For each test class, intent tags should reflect the primary behavior under test, not historical naming conventions.
## Governance and execution policy
The following rules are used to keep the suite auditable and stable:
| Rule | Required state | Why |
| --- | --- | --- |
| Scope discipline | Exactly one scope tag per class. | Prevents accidental promotion of integration-only behavior into fast unit runs. |
| Coverage breadth | At least one domain tag per class. | Ensures tests can be grouped by subsystem for targeted review. |
| Intent specificity | Use at least one intent tag when behavior is non-trivial. | Makes failure triage faster and profile composition explicit. |
| Runtime policy | Never run `slow` tests in the default `unit` profile unless explicitly required. | Preserves turnaround for PR feedback while preserving deep checks. |
| Change risk | Any persistence or compatibility-affecting change must include `compat` in validation. | Protects long-lived binary artifact contracts. |
| Mutation resistance | `fuzz`/`property` sets should be gated to dedicated profiles. | Limits flakiness exposure and controls CI resource cost. |
## Suggested CI profiles
These are recommended launch profiles for local and CI usage and are also exposed as Gradle tasks:
- **Profile: `ci-smoke` (fast feedback):**
```
./gradlew test -DincludeTags=unit -DexcludeTags=slow
./gradlew ciSmoke
```
`ciSmoke` also excludes `org.egothor.stemmer.CompileIntegrationTest*` at test-name filter level as a
defensive fallback in case of future tag drift.
`ciRelease` also excludes
`org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*` at filter level.
- **Profile: `ci-core` (core behavioral coverage):**
```
./gradlew test -DincludeTags=unit,trie,frequency-trie,property
./gradlew ciCore
```
- **Profile: `ci-integration` (pipeline correctness):**
```
./gradlew test -DincludeTags=integration
./gradlew ciIntegration
```
- **Profile: `ci-slow` (explicit heavy validation):**
```
./gradlew ciSlow
```
- **Profile: `ci-compat` (artifact stability):**
```
./gradlew test -DincludeTags=compat,regression
./gradlew ciCompat
```
- **Profile: `ci-release` (strong confidence before release):**
```
./gradlew test -DexcludeTags=slow
./gradlew ciRelease
```
`ciRelease` is non-slow by policy and uses the same defensive name-based exclusion for
`org.egothor.stemmer.CompileIntegrationTest*` and
`org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*` in addition to tag filtering.
- **Profile: `ci-nightly` (extended hardening):**
```
./gradlew test -DincludeTags=fuzz
./gradlew ciNightly
```
- **Profile: `ci` (enterprise umbrella):**
```
./gradlew ci
```
`ci` and `ciRelease` intentionally do **not** include `slow` paths. Run `ciSlow` explicitly for production-dictionary stress and long-running corpus checks.
## Practical examples
All examples use Gradle with JUnit Platform integration:
- Only unit tests:
```
./gradlew test -DincludeTags=unit
```
- Integration tests only:
```
./gradlew test -DincludeTags=integration
```
- Only trie subsystem tests:
```
./gradlew test -DincludeTags=trie
```
- Deterministic fuzz checks:
```
./gradlew test -DincludeTags=fuzz
```
- Property tests:
```
./gradlew test -DincludeTags=property
```
- Stemmer + patch command behavior:
```
./gradlew test -DincludeTags=stemmer,patch
```
- Compatibility artifacts and regression checks:
```
./gradlew test -DincludeTags=compat
```
- Keep regression suite and remove long-running cases:
```
./gradlew test -DincludeTags=regression -DexcludeTags=slow
```
- Trie + patch behavior:
```
./gradlew test -DincludeTags=trie,patch
```
- Deterministic compatibility and persistence checks:
```
./gradlew test -DincludeTags=compat,determinism,serialization
```
## Notes
- `-DincludeTags` and `-DexcludeTags` are read by the build script (the same names are also accepted as
`-P` project properties), applied to the `test` task, and forwarded into JUnit tag filtering.
- Class-name filtering is also available via Gradle test selectors where needed
(for example, `--tests *CompileTest`), but tag filtering remains the default
execution strategy.
- `-DincludeTags` supports comma-separated literal tags. When you need a single exact tag with special
characters, quote the argument for the shell.

View File

@@ -84,7 +84,7 @@ publishing {
}
signing {
required { !version.toString().endsWith('-SNAPSHOT') }
required = !version.toString().endsWith('-SNAPSHOT')
if (signingKey != null && !signingKey.isBlank()) {
useInMemoryPgpKeys(signingKey, signingPassword)
sign publishing.publications.mavenJava

View File

@@ -54,6 +54,7 @@ nav:
- Overview: architecture-and-reduction.md
- Architecture: architecture.md
- Reduction Semantics: reduction-semantics.md
- Lookup Edge Optimization: lookup-edge-optimization.md
- Compatibility and Guarantees: compatibility-and-guarantees.md
- Dictionaries:
@@ -63,3 +64,4 @@ nav:
- Quality and Operations: quality-and-operations.md
- Benchmarking: benchmarking.md
- Reports: reports.md
- Test taxonomy and execution filtering: test-taxonomy-and-filtering.md

View File

@@ -51,7 +51,6 @@ import java.util.logging.Logger;
import org.egothor.stemmer.trie.CompiledNode;
import org.egothor.stemmer.trie.LocalValueSummary;
import org.egothor.stemmer.trie.MutableNode;
import org.egothor.stemmer.trie.NodeData;
import org.egothor.stemmer.trie.ReducedNode;
import org.egothor.stemmer.trie.ReductionContext;
import org.egothor.stemmer.trie.ReductionSignature;
@@ -87,7 +86,6 @@ import org.egothor.stemmer.trie.ReductionSignature;
*
* @param <V> value type
*/
@SuppressWarnings("PMD.CyclomaticComplexity")
public final class FrequencyTrie<V> {
/**
@@ -130,11 +128,54 @@ public final class FrequencyTrie<V> {
*/
private static final int STREAM_MAGIC = 0x45475452;
/**
* Minimum supported stream version constant retained for explicit range checks.
*/
private static final int MIN_STREAM_VERSION = 1;
/**
* Number of stored values for which {@link #getEntries(String)} can return an
* empty result.
*/
private static final int NO_VALUE_COUNT = 0;
/**
* Number of stored values for which {@link #getEntries(String)} can use a
* one-item immutable list special case.
*/
private static final int SINGLE_VALUE_COUNT = 1;
/**
* Binary format version.
*/
private static final int STREAM_VERSION = 5;
/**
* Version where traversal-direction ordinal is persisted.
*/
private static final int TRAVERSAL_VERSION = 2;
/**
* Version where compact reduction metadata is persisted.
*/
private static final int REDUCTION_VERSION = 3;
/**
* Version where case-processing mode ordinal is persisted.
*/
private static final int CASE_VERSION = 4;
/**
* Default dense child lookup span, measured in {@code char} values, used when
* materializing compiled nodes without an explicit override.
* <p>
* The value is taken from {@link CompiledNode#DEFAULT_MAX_EXPANDED_INDEX} so
* that tries frozen by a builder and tries loaded from a stream share one
* default instead of two independently maintained literals.
* </p>
* <p>
* Increasing this value increases the chance of direct array indexing for
* child lookup at runtime at the cost of per-node dense table memory for
* compact character spans.
* </p>
*/
public static final int DEFAULT_MAX_EXPANDED_INDEX = CompiledNode.DEFAULT_MAX_EXPANDED_INDEX;
/**
* Returns the current persisted binary stream format version.
*
@@ -259,7 +300,6 @@ public final class FrequencyTrie<V> {
* if the key does not exist or no value is stored at the addressed node
* @throws NullPointerException if {@code key} is {@code null}
*/
@SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
public List<ValueCount<V>> getEntries(final String key) {
Objects.requireNonNull(key, "key");
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
@@ -269,11 +309,11 @@ public final class FrequencyTrie<V> {
final V[] orderedValues = node.orderedValues();
final int valueCount = orderedValues.length;
if (valueCount == 0) {
if (valueCount == NO_VALUE_COUNT) {
return List.of();
}
if (valueCount == 1) {
if (valueCount == SINGLE_VALUE_COUNT) {
return List.of(new ValueCount<>(orderedValues[0], node.orderedCounts()[0]));
}
@@ -383,47 +423,31 @@ public final class FrequencyTrie<V> {
*/
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
final ValueStreamCodec<V> valueCodec) throws IOException {
Objects.requireNonNull(inputStream, "inputStream");
Objects.requireNonNull(arrayFactory, "arrayFactory");
Objects.requireNonNull(valueCodec, "valueCodec");
final DataInputStream dataInput; // NOPMD
if (inputStream instanceof DataInputStream) {
dataInput = (DataInputStream) inputStream;
} else {
dataInput = new DataInputStream(inputStream);
return readFrom(inputStream, arrayFactory, valueCodec, -1);
}
final int magic = dataInput.readInt();
if (magic != STREAM_MAGIC) {
throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic));
}
final int version = dataInput.readInt();
if (version != 1 && version != 3 && version != 4 && version != STREAM_VERSION) {
throw new IOException("Unsupported trie stream version: " + version);
}
final int nodeCount = dataInput.readInt();
if (nodeCount < 0) {
throw new IOException("Negative node count: " + nodeCount);
}
final int rootNodeId = dataInput.readInt();
if (rootNodeId < 0 || rootNodeId >= nodeCount) {
throw new IOException("Invalid root node id: " + rootNodeId);
}
final TrieMetadata metadata = readMetadata(dataInput, version);
final CompiledNode<V>[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount);
final CompiledNode<V> rootNode = nodes[rootNodeId];
if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
}
return new FrequencyTrie<>(arrayFactory, rootNode, metadata);
/**
* Reads a compiled trie from the supplied input stream, optionally overriding
* dense child-index span configuration.
* <p>
* This setting is applied only while materializing the in-memory compiled
* representation during load. It is not serialized in {@link TrieMetadata},
* so each load can independently choose its own runtime lookup trade-off.
* </p>
*
* @param inputStream source input stream
* @param arrayFactory array factory used to create typed arrays
* @param valueCodec codec used to read values
* @param maxExpandedIndex dense lookup span override; zero disables dense lookup,
* {@code -1} selects {@link #DEFAULT_MAX_EXPANDED_INDEX},
* and values below {@code -1} are rejected
* @param <V> value type
* @return deserialized compiled trie
* @throws NullPointerException if any reference argument is {@code null}
* @throws IllegalArgumentException if {@code maxExpandedIndex} is less than
* {@code -1}
* @throws IOException if reading fails or the binary format is invalid
*/
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
final ValueStreamCodec<V> valueCodec, final int maxExpandedIndex) throws IOException {
return CompiledTrieReader.read(inputStream, arrayFactory, valueCodec, maxExpandedIndex);
}
/**
@@ -438,73 +462,6 @@ public final class FrequencyTrie<V> {
dataOutput.writeUTF(metadata.toTextBlock());
}
/**
* Reads persisted trie metadata while remaining backward compatible with
* earlier stream versions.
*
* @param dataInput input stream
* @param version persisted stream version
* @return deserialized metadata
* @throws IOException if the metadata section is invalid
*/
private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException {
if (version >= 5) { // NOPMD
try {
return TrieMetadata.fromTextBlock(version, dataInput.readUTF());
} catch (IllegalArgumentException exception) {
throw new IOException("Invalid metadata block.", exception);
}
}
final WordTraversalDirection traversalDirection;
if (version >= 2) { // NOPMD
final int traversalDirectionOrdinal = dataInput.readInt();
final WordTraversalDirection[] traversalDirections = WordTraversalDirection.values();
if (traversalDirectionOrdinal < 0 || traversalDirectionOrdinal >= traversalDirections.length) {
throw new IOException("Invalid traversal direction ordinal: " + traversalDirectionOrdinal);
}
traversalDirection = traversalDirections[traversalDirectionOrdinal];
} else {
traversalDirection = WordTraversalDirection.BACKWARD;
}
if (version < 3) { // NOPMD
return TrieMetadata.legacy(version, traversalDirection);
}
final ReductionMode[] reductionModes = ReductionMode.values();
final int reductionModeOrdinal = dataInput.readInt();
if (reductionModeOrdinal < 0 || reductionModeOrdinal >= reductionModes.length) {
throw new IOException("Invalid reduction mode ordinal: " + reductionModeOrdinal);
}
final int dominantWinnerMinPercent = dataInput.readInt();
final int dominantWinnerOverSecondRatio = dataInput.readInt(); // NOPMD
final DiacriticProcessingMode[] diacriticProcessingModes = DiacriticProcessingMode.values();
final int diacriticProcessingModeOrdinal = dataInput.readInt(); // NOPMD
if (diacriticProcessingModeOrdinal < 0 || diacriticProcessingModeOrdinal >= diacriticProcessingModes.length) {
throw new IOException("Invalid diacritic processing mode ordinal: " + diacriticProcessingModeOrdinal);
}
final CaseProcessingMode caseProcessingMode;
if (version >= 4) { // NOPMD
final CaseProcessingMode[] caseProcessingModes = CaseProcessingMode.values();
final int caseProcessingModeOrdinal = dataInput.readInt();
if (caseProcessingModeOrdinal < 0 || caseProcessingModeOrdinal >= caseProcessingModes.length) {
throw new IOException("Invalid case processing mode ordinal: " + caseProcessingModeOrdinal);
}
caseProcessingMode = caseProcessingModes[caseProcessingModeOrdinal];
} else {
caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
}
return new TrieMetadata(version, traversalDirection,
new ReductionSettings(reductionModes[reductionModeOrdinal], dominantWinnerMinPercent,
dominantWinnerOverSecondRatio),
diacriticProcessingModes[diacriticProcessingModeOrdinal], caseProcessingMode);
}
/**
* Returns the number of canonical compiled nodes reachable from the root.
*
@@ -574,20 +531,126 @@ public final class FrequencyTrie<V> {
}
/**
* Reads all compiled nodes and resolves child references.
* Internal helper that materializes serialized trie data.
*
* @param dataInput input
* @param arrayFactory array factory
* @param valueCodec value codec
* @param nodeCount number of nodes
* @param <V> value type
* @return array of nodes indexed by serialized node identifier
* @throws IOException if reading fails or the stream is invalid
* <p>
* Moving reader complexity into this helper keeps the public-facing class from
* accumulating excessive class-level cyclomatic complexity while preserving the
* same binary compatibility contract.
* </p>
*/
@SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops")
private static final class CompiledTrieReader {
/**
 * Deserializes one compiled trie from a binary stream.
 * <p>
 * The stream is consumed in a fixed order: magic header, format version, node
 * count, root node identifier, metadata section, and finally the node table.
 * Each header field is validated as soon as it has been read.
 * </p>
 *
 * @param inputStream      source stream; wrapped in a {@link DataInputStream}
 *                         unless it already is one
 * @param arrayFactory     factory for typed value arrays
 * @param valueCodec       codec used to decode stored values
 * @param maxExpandedIndex dense lookup span; any value below zero that passes
 *                         validation selects {@link #DEFAULT_MAX_EXPANDED_INDEX}
 * @param <V>              value type
 * @return the deserialized trie
 * @throws NullPointerException     if a reference argument is {@code null}
 * @throws IllegalArgumentException if {@code maxExpandedIndex} is below
 *                                  {@code -1}
 * @throws IOException              if reading fails or a header field is
 *                                  invalid
 */
private static <V> FrequencyTrie<V> read(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
        final ValueStreamCodec<V> valueCodec, final int maxExpandedIndex) throws IOException {
    Objects.requireNonNull(inputStream, "inputStream");
    Objects.requireNonNull(arrayFactory, "arrayFactory");
    Objects.requireNonNull(valueCodec, "valueCodec");
    if (maxExpandedIndex < -1) {
        throw new IllegalArgumentException("maxExpandedIndex must be >= -1.");
    }

    final DataInputStream in = wrapInputStream(inputStream);

    final int header = in.readInt();
    if (header != STREAM_MAGIC) {
        throw new IOException("Unsupported trie stream header: " + Integer.toHexString(header));
    }

    final int streamVersion = in.readInt();
    final boolean versionSupported = streamVersion >= MIN_STREAM_VERSION && streamVersion <= STREAM_VERSION;
    if (!versionSupported) {
        throw new IOException("Unsupported trie stream version: " + streamVersion);
    }

    final int totalNodes = in.readInt();
    if (totalNodes < 0) {
        throw new IOException("Negative node count: " + totalNodes);
    }

    final int rootId = in.readInt();
    final boolean rootInRange = rootId >= 0 && rootId < totalNodes;
    if (!rootInRange) {
        throw new IOException("Invalid root node id: " + rootId);
    }

    final TrieMetadata loadedMetadata = readMetadata(in, streamVersion);

    // A negative override means "no override": fall back to the default span.
    final int denseSpan;
    if (maxExpandedIndex < 0) {
        denseSpan = DEFAULT_MAX_EXPANDED_INDEX;
    } else {
        denseSpan = maxExpandedIndex;
    }

    final CompiledNode<V>[] compiledNodes = readNodes(in, arrayFactory, valueCodec, totalNodes, denseSpan);
    final CompiledNode<V> root = compiledNodes[rootId];
    if (LOGGER.isLoggable(Level.FINE)) {
        LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", totalNodes);
    }
    return new FrequencyTrie<>(arrayFactory, root, loadedMetadata);
}
/**
 * Returns a {@link DataInputStream} view of the supplied stream.
 *
 * @param inputStream stream to adapt
 * @return the same instance when it already is a {@link DataInputStream},
 *         otherwise a new wrapper around it
 */
private static DataInputStream wrapInputStream(final InputStream inputStream) {
    if (inputStream instanceof DataInputStream) {
        // Reuse the caller's instance instead of adding a second wrapper layer.
        return (DataInputStream) inputStream;
    }
    return new DataInputStream(inputStream);
}
/**
 * Reads the metadata section for the given stream version.
 * <p>
 * The current format stores metadata as one text block. Older formats store
 * individual ordinals and integers, and each older version only contains the
 * fields that existed when it was written; absent fields fall back to fixed
 * defaults.
 * </p>
 *
 * @param dataInput input positioned at the start of the metadata section
 * @param version   already validated stream version
 * @return deserialized metadata
 * @throws IOException if reading fails or a stored field is invalid
 */
private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException {
    if (version == STREAM_VERSION) {
        return readTextMetadata(dataInput);
    }

    final WordTraversalDirection direction = readTraversalDirection(dataInput, version);
    if (version < REDUCTION_VERSION) {
        // Formats older than the reduction-metadata version carry nothing further.
        return TrieMetadata.legacy(version, direction);
    }

    final ReductionSettings settings = readReductionSettings(dataInput);
    final DiacriticProcessingMode diacriticMode = readEnumByOrdinal(dataInput, DiacriticProcessingMode.values(),
            "diacritic processing mode");

    final CaseProcessingMode caseMode;
    if (version >= CASE_VERSION) {
        caseMode = readCaseProcessingMode(dataInput);
    } else {
        caseMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
    }
    return new TrieMetadata(version, direction, settings, diacriticMode, caseMode);
}
/**
 * Reads the text-block metadata used by the current stream format.
 *
 * @param dataInput input positioned at the metadata text block
 * @return parsed metadata tagged with {@link #STREAM_VERSION}
 * @throws IOException if reading fails or the text block is rejected by
 *                     {@link TrieMetadata#fromTextBlock}
 */
private static TrieMetadata readTextMetadata(final DataInputStream dataInput) throws IOException {
    try {
        final String textBlock = dataInput.readUTF();
        return TrieMetadata.fromTextBlock(STREAM_VERSION, textBlock);
    } catch (IllegalArgumentException invalidBlock) {
        // Report malformed content as a stream format error, keeping the cause.
        throw new IOException("Invalid metadata block.", invalidBlock);
    }
}
/**
 * Reads the traversal direction, if the stream version stores one.
 *
 * @param dataInput input positioned where the direction ordinal would be
 * @param version   already validated stream version
 * @return the stored direction, or {@link WordTraversalDirection#BACKWARD} for
 *         versions older than {@link #TRAVERSAL_VERSION}, in which case nothing
 *         is consumed from the stream
 * @throws IOException if reading fails or the stored ordinal is out of range
 */
private static WordTraversalDirection readTraversalDirection(final DataInputStream dataInput, final int version)
        throws IOException {
    if (version >= TRAVERSAL_VERSION) {
        return readEnumByOrdinal(dataInput, WordTraversalDirection.values(), "traversal direction");
    }
    return WordTraversalDirection.BACKWARD;
}
/**
 * Reads the legacy binary reduction settings: a reduction mode ordinal followed
 * by two integers.
 *
 * @param dataInput input positioned at the reduction mode ordinal
 * @return reduction settings built from the three stored fields
 * @throws IOException if reading fails or the mode ordinal is out of range
 */
private static ReductionSettings readReductionSettings(final DataInputStream dataInput) throws IOException {
    // The three fields must be consumed in exactly this order.
    final ReductionMode mode = readEnumByOrdinal(dataInput, ReductionMode.values(), "reduction mode");
    final int minPercent = dataInput.readInt();
    final int overSecondRatio = dataInput.readInt();
    return new ReductionSettings(mode, minPercent, overSecondRatio);
}
/**
* Reads the case-processing mode stored as an enum ordinal.
*
* @param dataInput input positioned at the case-processing mode ordinal
* @return the case-processing mode with the stored ordinal
* @throws IOException if reading fails or the stored ordinal is out of range
*/
private static CaseProcessingMode readCaseProcessingMode(final DataInputStream dataInput) throws IOException {
return readEnumByOrdinal(dataInput, CaseProcessingMode.values(), "case processing mode");
}
/**
 * Reads one {@code int} and maps it to the enum constant at that position.
 *
 * @param dataInput input positioned at the stored ordinal
 * @param values    candidate constants, indexed by ordinal
 * @param name      human-readable field name used in the error message
 * @param <E>       enum type
 * @return the constant at the stored position
 * @throws IOException if reading fails or the stored value is not a valid index
 *                     into {@code values}
 */
private static <E extends Enum<E>> E readEnumByOrdinal(final DataInputStream dataInput, final E[] values,
        final String name) throws IOException {
    final int storedOrdinal = dataInput.readInt();
    if (storedOrdinal >= 0 && storedOrdinal < values.length) {
        return values[storedOrdinal];
    }
    throw new IOException("Invalid " + name + " ordinal: " + storedOrdinal);
}
private static <V> CompiledNode<V>[] readNodes(final DataInputStream dataInput, final IntFunction<V[]> arrayFactory,
final ValueStreamCodec<V> valueCodec, final int nodeCount) throws IOException {
final List<NodeData<V>> nodeDataList = new ArrayList<>(nodeCount);
final ValueStreamCodec<V> valueCodec, final int nodeCount, final int maxExpandedIndex) throws IOException {
final char[][] edgeLabelsByNode = new char[nodeCount][];
final int[][] childNodeIdsByNode = new int[nodeCount][];
@SuppressWarnings("unchecked")
final V[][] orderedValuesByNode = (V[][]) new Object[nodeCount][];
final int[][] orderedCountsByNode = new int[nodeCount][];
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
final int edgeCount = dataInput.readInt();
@@ -595,77 +658,85 @@ public final class FrequencyTrie<V> {
throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount);
}
final char[] edgeLabels = new char[edgeCount];
final int[] childNodeIds = new int[edgeCount];
edgeLabelsByNode[nodeIndex] = new char[edgeCount];
childNodeIdsByNode[nodeIndex] = new int[edgeCount];
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
edgeLabels[edgeIndex] = dataInput.readChar();
childNodeIds[edgeIndex] = dataInput.readInt();
edgeLabelsByNode[nodeIndex][edgeIndex] = dataInput.readChar();
childNodeIdsByNode[nodeIndex][edgeIndex] = dataInput.readInt();
}
validateSerializedEdges(nodeIndex, edgeLabels);
validateSerializedEdges(nodeIndex, edgeLabelsByNode[nodeIndex]);
final int valueCount = dataInput.readInt();
if (valueCount < 0) {
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
}
final V[] orderedValues = arrayFactory.apply(valueCount);
final int[] orderedCounts = new int[valueCount];
orderedValuesByNode[nodeIndex] = arrayFactory.apply(valueCount);
orderedCountsByNode[nodeIndex] = new int[valueCount];
for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) {
orderedValues[valueIndex] = valueCodec.read(dataInput);
orderedCounts[valueIndex] = dataInput.readInt();
if (orderedCounts[valueIndex] <= 0) {
orderedValuesByNode[nodeIndex][valueIndex] = valueCodec.read(dataInput);
orderedCountsByNode[nodeIndex][valueIndex] = dataInput.readInt();
if (orderedCountsByNode[nodeIndex][valueIndex] <= 0) {
throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index "
+ valueIndex + ": " + orderedCounts[valueIndex]);
+ valueIndex + ": " + orderedCountsByNode[nodeIndex][valueIndex]);
}
}
nodeDataList.add(new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts));
}
@SuppressWarnings("unchecked")
final CompiledNode<V>[] nodes = new CompiledNode[nodeCount];
final boolean[] inProgress = new boolean[nodeCount];
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
@SuppressWarnings("unchecked")
final CompiledNode<V>[] children = new CompiledNode[nodeData.childNodeIds().length];
nodes[nodeIndex] = new CompiledNode<>(nodeData.edgeLabels(), children, nodeData.orderedValues(),
nodeData.orderedCounts());
}
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
final CompiledNode<V> node = nodes[nodeIndex];
for (int edgeIndex = 0; edgeIndex < nodeData.childNodeIds().length; edgeIndex++) {
final int childNodeId = nodeData.childNodeIds()[edgeIndex];
if (childNodeId < 0 || childNodeId >= nodeCount) {
throw new IOException("Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex
+ ": " + childNodeId);
}
node.children()[edgeIndex] = nodes[childNodeId];
}
nodes[nodeIndex] = resolveNode(nodeIndex, edgeLabelsByNode, childNodeIdsByNode, orderedValuesByNode,
orderedCountsByNode, nodes, inProgress, maxExpandedIndex);
}
return nodes;
}
/**
* Validates the serialized edge-label sequence for one node.
*
* <p>
* Compiled nodes rely on binary search for child lookup and therefore require
* edge labels to be stored in strict ascending order without duplicates.
* Rejecting malformed streams here keeps lookup semantics deterministic and
* avoids silently constructing a trie whose search behavior would be undefined.
*
* @param nodeIndex serialized node identifier
* @param edgeLabels serialized edge labels
* @throws IOException if the edge labels are not strictly ascending
*/
/**
* Materializes the compiled node with the given serialized identifier,
* resolving all of its children first.
* <p>
* Children are resolved recursively before the node itself is constructed, so
* every {@link CompiledNode} receives a fully populated child array at
* construction time. Resolved nodes are memoized in {@code nodes}. The
* {@code inProgress} flags mark the identifiers on the current resolution path
* so that a child reference leading back to one of them is rejected as a cycle.
* </p>
* <p>
* NOTE(review): the recursion depth follows the longest child chain stored in
* the stream; confirm that very deep untrusted inputs cannot exhaust the
* thread stack.
* </p>
*
* @param nodeIndex serialized identifier of the node to resolve
* @param edgeLabelsByNode edge labels per serialized node
* @param childNodeIdsByNode child identifiers per serialized node, aligned with
* the edge labels
* @param orderedValuesByNode ordered values per serialized node
* @param orderedCountsByNode ordered counts per serialized node
* @param nodes memo table of already resolved nodes, filled in
* by this method
* @param inProgress flags for nodes on the current resolution path
* @param maxExpandedIndex dense lookup span passed to every constructed
* node
* @param <V> value type
* @return the resolved node for {@code nodeIndex}
* @throws IOException if a child identifier is out of range or the serialized
* graph contains a cycle
*/
private static <V> CompiledNode<V> resolveNode(final int nodeIndex, final char[][] edgeLabelsByNode,
final int[][] childNodeIdsByNode, final V[][] orderedValuesByNode, final int[][] orderedCountsByNode,
final CompiledNode<V>[] nodes, final boolean[] inProgress, final int maxExpandedIndex) throws IOException {
// Shared subtrees are resolved once and reused afterwards.
final CompiledNode<V> cachedNode = nodes[nodeIndex];
if (cachedNode != null) {
return cachedNode;
}
// Reaching a node that is still being resolved means the stream describes a cycle.
if (inProgress[nodeIndex]) {
throw new IOException("Invalid serialized node graph: cyclic reference detected at node " + nodeIndex + '.');
}
inProgress[nodeIndex] = true;
try {
final char[] edgeLabels = edgeLabelsByNode[nodeIndex];
final int[] childNodeIds = childNodeIdsByNode[nodeIndex];
final int edgeCount = childNodeIds.length;
@SuppressWarnings("unchecked")
final CompiledNode<V>[] children = new CompiledNode[edgeCount];
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
final int childNodeId = childNodeIds[edgeIndex];
// The per-node arrays hold one entry per serialized node, so their length is the node count.
if (childNodeId < 0 || childNodeId >= edgeLabelsByNode.length) {
throw new IOException(
"Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex + ": "
+ childNodeId);
}
children[edgeIndex] = resolveNode(childNodeId, edgeLabelsByNode, childNodeIdsByNode,
orderedValuesByNode, orderedCountsByNode, nodes, inProgress, maxExpandedIndex);
}
// All children exist at this point, so the node can be built in one step.
final CompiledNode<V> node = new CompiledNode<>(edgeLabels, children, orderedValuesByNode[nodeIndex], maxExpandedIndex,
orderedCountsByNode[nodeIndex]);
nodes[nodeIndex] = node;
return node;
} finally {
// The node leaves the current resolution path on success and on failure alike.
inProgress[nodeIndex] = false;
}
}
private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException {
for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) {
if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) {
@@ -674,6 +745,7 @@ public final class FrequencyTrie<V> {
}
}
}
}
/**
* Locates the compiled node for the supplied key.
@@ -771,6 +843,16 @@ public final class FrequencyTrie<V> {
*/
private final DiacriticProcessingMode diacriticProcessingMode;
/**
* Dense edge lookup span threshold.
* <p>
* This value controls a speed/memory trade-off during freezing:
* dense child lookup tables are allocated only for nodes whose child
* labels fit in this span.
* </p>
*/
private final int maxExpandedIndex;
/**
* Mutable root node.
*/
@@ -837,11 +919,39 @@ public final class FrequencyTrie<V> {
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
final DiacriticProcessingMode diacriticProcessingMode) {
// Delegate to the full constructor, using the default dense child lookup span
// declared by CompiledNode.
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, diacriticProcessingMode,
CompiledNode.DEFAULT_MAX_EXPANDED_INDEX);
}
/**
* Creates a new builder with the provided settings, explicit traversal
* direction, explicit case processing mode, explicit diacritic processing
* mode, and an explicit dense child lookup threshold.
*
* @param arrayFactory array factory
* @param reductionSettings reduction configuration
* @param traversalDirection logical key traversal direction
* @param caseProcessingMode dictionary case processing mode
* @param diacriticProcessingMode dictionary diacritic processing mode
* @param maxExpandedIndex dense lookup span override; zero disables
* dense lookup. Larger values increase direct
* indexing opportunities while potentially
* increasing materialization memory in nodes
* whose edge label span is within the limit.
* Must not be negative; this constructor has
* no "use the default" sentinel value.
* @throws NullPointerException if any reference argument is {@code null}
* @throws IllegalArgumentException if {@code maxExpandedIndex} is negative
*/
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
final DiacriticProcessingMode diacriticProcessingMode, final int maxExpandedIndex) {
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
// Null checks run first, so a null argument is reported before a negative span.
if (maxExpandedIndex < 0) {
throw new IllegalArgumentException("maxExpandedIndex must be non-negative.");
}
this.maxExpandedIndex = maxExpandedIndex;
this.root = new MutableNode<>();
}
@@ -1098,7 +1208,7 @@ public final class FrequencyTrie<V> {
}
final CompiledNode<V> frozen = new CompiledNode<>(edges, childNodes, localSummary.orderedValues(),
localSummary.orderedCounts());
this.maxExpandedIndex, localSummary.orderedCounts());
cache.put(reducedNode, frozen);
return frozen;
}

View File

@@ -94,6 +94,29 @@ public final class StemmerPatchTrieBinaryIO {
}
}
/**
* Reads a GZip-compressed binary patch-command trie from a filesystem path
* with an optional dense child lookup span override.
* <p>
* This is a runtime-only tuning parameter. The dense-span setting is not
* persisted in the file and does not change the compiled metadata.
* </p>
*
* @param path source file
* @param maxExpandedIndex dense lookup span override; zero disables dense
* lookup, {@code -1} selects
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX},
* and values below {@code -1} are rejected
* @return deserialized trie
* @throws NullPointerException if {@code path} is {@code null}
* @throws IllegalArgumentException if {@code maxExpandedIndex} is less than
* {@code -1}
* @throws IOException if reading or decompression fails
*/
public static FrequencyTrie<String> read(final Path path, final int maxExpandedIndex) throws IOException {
Objects.requireNonNull(path, "path");
// The file stream is opened here and closed again before returning.
try (InputStream fileInputStream = Files.newInputStream(path)) {
return read(fileInputStream, maxExpandedIndex);
}
}
/**
* Reads a GZip-compressed binary patch-command trie from a filesystem path
* string.
@@ -108,6 +131,26 @@ public final class StemmerPatchTrieBinaryIO {
return read(Path.of(fileName));
}
/**
* Reads a GZip-compressed binary patch-command trie from a filesystem path
* string with an optional dense child lookup span override.
* <p>
* This is a runtime-only tuning parameter. The dense-span setting is not
* persisted in the file and does not change the compiled metadata.
* </p>
*
* @param fileName source file name or path string
* @param maxExpandedIndex dense lookup span override; zero disables dense
* lookup, {@code -1} selects
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX},
* and values below {@code -1} are rejected
* @return deserialized trie
* @throws NullPointerException if {@code fileName} is {@code null}
* @throws IllegalArgumentException if {@code maxExpandedIndex} is less than
* {@code -1}
* @throws IOException if reading or decompression fails
*/
public static FrequencyTrie<String> read(final String fileName, final int maxExpandedIndex) throws IOException {
Objects.requireNonNull(fileName, "fileName");
return read(Path.of(fileName), maxExpandedIndex);
}
/**
* Reads a GZip-compressed binary patch-command trie from an input stream.
*
@@ -132,6 +175,34 @@ public final class StemmerPatchTrieBinaryIO {
}
}
/**
* Reads a GZip-compressed binary patch-command trie from an input stream with
* an optional dense child lookup span override.
* <p>
* This is a runtime-only tuning parameter. The dense-span setting is not
* persisted in the file and does not change the compiled metadata.
* </p>
* <p>
* The supplied stream is wrapped for buffering and decompression. Once the
* wrappers have been created they are closed when this method finishes, which
* also closes the supplied stream.
* </p>
*
* @param inputStream source stream
* @param maxExpandedIndex dense lookup span override; zero disables dense
* lookup, {@code -1} selects
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX},
* and values below {@code -1} are rejected
* @return deserialized trie
* @throws NullPointerException if {@code inputStream} is {@code null}
* @throws IllegalArgumentException if {@code maxExpandedIndex} is less than
* {@code -1}
* @throws IOException if reading or decompression fails
*/
public static FrequencyTrie<String> read(final InputStream inputStream, final int maxExpandedIndex) throws IOException {
Objects.requireNonNull(inputStream, "inputStream");
// Closing the wrappers in try-with-resources also closes the caller's stream.
try (GZIPInputStream gzipInputStream = new GZIPInputStream(new BufferedInputStream(inputStream));
DataInputStream dataInputStream = new DataInputStream(gzipInputStream)) {
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(dataInputStream, String[]::new, STRING_CODEC,
maxExpandedIndex);
LOGGER.log(Level.FINE, "Read compressed binary stemmer trie.");
return trie;
}
}
/**
* Reads only metadata from a GZip-compressed binary patch-command trie stored
* at a filesystem path.

View File

@@ -71,6 +71,7 @@ import java.util.zip.GZIPInputStream;
public final class StemmerPatchTrieLoader {
/* default */ static final String FILENAME_REQUIRED = "fileName required";
private static final String PARAMETER_PATH = "path";
/**
* Logger of this class.
@@ -461,7 +462,7 @@ public final class StemmerPatchTrieLoader {
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
throws IOException {
Objects.requireNonNull(path, "path");
Objects.requireNonNull(path, PARAMETER_PATH);
final TrieMetadata metadata = metadataForCompilation(traversalDirection, reductionSettings, caseProcessingMode,
diacriticProcessingMode);
return load(path, storeOriginal, metadata);
@@ -487,7 +488,7 @@ public final class StemmerPatchTrieLoader {
*/
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal, final TrieMetadata metadata)
throws IOException {
Objects.requireNonNull(path, "path");
Objects.requireNonNull(path, PARAMETER_PATH);
Objects.requireNonNull(metadata, "metadata");
try (InputStream inputStream = openDictionaryInputStream(path);
@@ -759,10 +760,31 @@ public final class StemmerPatchTrieLoader {
* read
*/
public static FrequencyTrie<String> loadBinary(final Path path) throws IOException {
Objects.requireNonNull(path, "path");
Objects.requireNonNull(path, PARAMETER_PATH);
return StemmerPatchTrieBinaryIO.read(path);
}
/**
* Loads a GZip-compressed binary patch-command trie from a filesystem path
* using a custom dense lookup span override.
* <p>
* This is a runtime-only tuning parameter that does not affect persisted
* metadata.
* </p>
*
* @param path path to the compressed binary trie file
* @param maxExpandedIndex dense lookup span override; zero disables dense
* lookup, {@code -1} selects
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX},
* and values below {@code -1} are rejected
* @return compiled patch-command trie
* @throws NullPointerException if {@code path} is {@code null}
* @throws IllegalArgumentException if {@code maxExpandedIndex} is less than
* {@code -1}
* @throws IOException if the file cannot be opened, decompressed, or
* read
*/
public static FrequencyTrie<String> loadBinary(final Path path, final int maxExpandedIndex) throws IOException {
Objects.requireNonNull(path, PARAMETER_PATH);
return StemmerPatchTrieBinaryIO.read(path, maxExpandedIndex);
}
/**
* Loads a GZip-compressed binary patch-command trie from a filesystem path
* string.
@@ -778,6 +800,27 @@ public final class StemmerPatchTrieLoader {
return StemmerPatchTrieBinaryIO.read(fileName);
}
/**
* Loads a GZip-compressed binary patch-command trie from a filesystem path
* string using a custom dense lookup span override.
* <p>
* This is a runtime-only tuning parameter that does not affect persisted
* metadata.
* </p>
*
* @param fileName file name or path string
* @param maxExpandedIndex dense lookup span override; zero disables dense
* lookup, {@code -1} selects
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX},
* and values below {@code -1} are rejected
* @return compiled patch-command trie
* @throws NullPointerException if {@code fileName} is {@code null}
* @throws IllegalArgumentException if {@code maxExpandedIndex} is less than
* {@code -1}
* @throws IOException if the file cannot be opened, decompressed, or
* read
*/
public static FrequencyTrie<String> loadBinary(final String fileName, final int maxExpandedIndex) throws IOException {
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
return StemmerPatchTrieBinaryIO.read(fileName, maxExpandedIndex);
}
/**
* Loads a GZip-compressed binary patch-command trie from an input stream.
*
@@ -802,7 +845,7 @@ public final class StemmerPatchTrieLoader {
* read
*/
public static TrieMetadata loadBinaryMetadata(final Path path) throws IOException {
Objects.requireNonNull(path, "path");
Objects.requireNonNull(path, PARAMETER_PATH);
return StemmerPatchTrieBinaryIO.readMetadata(path);
}
@@ -845,7 +888,7 @@ public final class StemmerPatchTrieLoader {
*/
public static void saveBinary(final FrequencyTrie<String> trie, final Path path) throws IOException {
Objects.requireNonNull(trie, "trie");
Objects.requireNonNull(path, "path");
Objects.requireNonNull(path, PARAMETER_PATH);
StemmerPatchTrieBinaryIO.write(trie, path);
}

View File

@@ -44,13 +44,14 @@ import java.util.Objects;
* arrays once and all lookup operations thereafter treat them as read-only.
*
* @param <V> value type
* @param edgeLabels internal edge label array
* @param children internal child array
* @param orderedValues internal ordered values array
* @param orderedCounts internal ordered counts array
*/
@SuppressWarnings("PMD.DataClass")
public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues, int... orderedCounts) {
public final class CompiledNode<V> {
/**
* Default dense child lookup span in characters used when an explicit override is
* not provided.
*/
public static final int DEFAULT_MAX_EXPANDED_INDEX = 512;
/**
* Number of child edges where linear scan is cheaper than binary search.
@@ -58,24 +59,112 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
private static final int LINEAR_CHILD_COUNT_THRESHOLD = 4;
/**
* Creates one validated compiled node.
* Edge labels in sorted ascending order.
*/
private final char[] edgeLabels;
/**
* Sparse child array aligned with {@link #edgeLabels}.
*/
private final CompiledNode<V>[] children;
/**
* Dense child lookup table used when labels fit into a compact char interval.
* <p>
* The table enables direct O(1) indexing for child lookup and is allocated
* only when the character span of this node's edges is within the configured
* threshold.
* </p>
*/
private final CompiledNode<V>[] denseChildren;
/**
* Normalized minimum edge value for the dense lookup table.
*/
private final int denseEdgeMin;
/**
* Values stored at this node in local order.
*/
private final V[] orderedValues;
/**
* Occurrence counts aligned with {@link #orderedValues}.
*/
private final int[] orderedCounts;
/**
* Creates one validated compiled node using {@link #DEFAULT_MAX_EXPANDED_INDEX}
* for dense lookup sizing.
*
* @throws NullPointerException if any array argument is {@code null}
* @throws IllegalArgumentException if the edge-related arrays or value-related
* arrays do not have matching lengths
*/
public CompiledNode {
public CompiledNode(final char[] edgeLabels, final CompiledNode<V>[] children, final V[] orderedValues,
final int... orderedCounts) {
// Delegate to the full constructor, which performs all validation and builds
// the dense lookup table using the default span limit.
this(edgeLabels, children, orderedValues, DEFAULT_MAX_EXPANDED_INDEX, orderedCounts);
}
/**
 * Creates one validated compiled node and, when the edge labels are compact
 * enough, a direct-index child table next to the sparse arrays.
 * <p>
 * The dense table is built only when the node has at least one edge,
 * {@code maxExpandedIndex} is positive, and the distance between the first and
 * the last edge label does not exceed {@code maxExpandedIndex}. The supplied
 * arrays are stored as passed in, without defensive copies.
 * </p>
 *
 * @param edgeLabels       edge labels; the first and last elements are used as
 *                         the bounds of the dense interval
 * @param children         child nodes aligned with {@code edgeLabels}
 * @param orderedValues    values stored at this node
 * @param maxExpandedIndex upper bound for the dense lookup interval size; zero
 *                         disables dense lookup. Larger values improve
 *                         direct-index likelihood while increasing dense
 *                         table memory in compact-label nodes.
 * @param orderedCounts    occurrence counts aligned with {@code orderedValues}
 * @throws NullPointerException     if any array argument is {@code null}
 * @throws IllegalArgumentException if the edge-related arrays or value-related
 *                                  arrays do not have matching lengths or the
 *                                  dense interval size is negative
 */
public CompiledNode(final char[] edgeLabels, final CompiledNode<V>[] children, final V[] orderedValues,
        final int maxExpandedIndex, final int... orderedCounts) {
    Objects.requireNonNull(edgeLabels, "edgeLabels");
    Objects.requireNonNull(children, "children");
    Objects.requireNonNull(orderedValues, "orderedValues");
    Objects.requireNonNull(orderedCounts, "orderedCounts");
    if (maxExpandedIndex < 0) {
        throw new IllegalArgumentException("maxExpandedIndex must be non-negative.");
    }
    if (edgeLabels.length != children.length) {
        throw new IllegalArgumentException("edgeLabels and children must have the same length.");
    }
    if (orderedValues.length != orderedCounts.length) {
        throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
    }

    this.edgeLabels = edgeLabels;
    this.children = children;
    this.orderedValues = orderedValues;
    this.orderedCounts = orderedCounts;

    // Stay on the sparse representation unless every condition for a dense
    // table is met; both dense fields are assigned exactly once at the end.
    CompiledNode<V>[] denseTable = null;
    int denseOrigin = 0;
    if (edgeLabels.length > 0 && maxExpandedIndex > 0) {
        final int lowestLabel = edgeLabels[0];
        final int labelSpan = edgeLabels[edgeLabels.length - 1] - lowestLabel;
        if (labelSpan >= 0 && labelSpan <= maxExpandedIndex) {
            @SuppressWarnings("unchecked")
            final CompiledNode<V>[] table = (CompiledNode<V>[]) new CompiledNode[labelSpan + 1];
            for (int position = 0; position < edgeLabels.length; position++) {
                table[edgeLabels[position] - lowestLabel] = children[position];
            }
            denseTable = table;
            denseOrigin = lowestLabel;
        }
    }
    this.denseChildren = denseTable;
    this.denseEdgeMin = denseOrigin;
}
/**
@@ -87,7 +176,6 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
*
* @return internal edge-label array
*/
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public char[] edgeLabels() {
return this.edgeLabels;
@@ -102,7 +190,6 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
*
* @return internal child-node array
*/
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public CompiledNode<V>[] children() {
return this.children;
@@ -117,7 +204,6 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
*
* @return internal ordered-values array
*/
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public V[] orderedValues() {
return this.orderedValues;
@@ -132,14 +218,143 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
*
* @return internal ordered-counts array
*/
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public int[] orderedCounts() {
return this.orderedCounts;
}
/**
* Returns the number of child edges represented by this node.
* <p>
* The value is the length of the edge-label array; the constructor rejects
* nodes whose child array has a different length.
* </p>
*
* @return child edge count
*/
public int edgeCount() {
return this.edgeLabels.length;
}
/**
* Returns the number of values stored in this node.
* <p>
* The value is the length of the ordered-values array; the constructor rejects
* nodes whose count array has a different length.
* </p>
*
* @return value count
*/
public int valueCount() {
return this.orderedValues.length;
}
/**
 * Tells whether at least one value is stored directly at this node.
 *
 * @return {@code true} when the value count is greater than zero
 */
public boolean hasValues() {
    return valueCount() > 0;
}
/**
 * Tells whether at least one edge leaves this node.
 *
 * @return {@code true} when the edge count is greater than zero
 */
public boolean hasChildren() {
    return edgeCount() > 0;
}
/**
 * Tells whether this node is a leaf, that is, a node without outgoing edges.
 *
 * @return {@code true} when the edge-label array is empty
 */
public boolean isLeaf() {
    return this.edgeLabels.length == 0;
}
/**
 * Checks whether following the supplied edge label from this node reaches a
 * child.
 * <p>
 * The answer is derived from {@link #findChild(char)}: the edge counts as
 * present exactly when that lookup yields a non-{@code null} child.
 * </p>
 *
 * @param edge edge label
 * @return {@code true} if a child is found for the supplied edge label
 */
public boolean hasEdge(final char edge) {
    final CompiledNode<V> child = findChild(edge);
    return child != null;
}
/**
 * Tells whether a dense direct-index child table was built for this node.
 *
 * @return {@code true} when the dense child table exists
 */
public boolean hasDenseLookup() {
    return Objects.nonNull(this.denseChildren);
}
/**
 * Reports how many slots the dense child table occupies, as a rough indicator
 * of the extra memory held by this node.
 *
 * @return number of dense table slots, or {@code 0} when no dense table exists
 */
public int denseTableLength() {
    if (this.denseChildren == null) {
        return 0;
    }
    return this.denseChildren.length;
}
/**
 * Returns a compact structural summary used by diagnostics and tests.
 * <p>
 * The hash covers the edge labels, children, values, counts, the dense table
 * origin and the dense table length. The dense table contents are deliberately
 * left out: every dense slot is either {@code null} or one of the elements of
 * {@link #children}, which are already hashed. Hashing the dense table as well
 * would visit every child twice and make the cost double with each trie level.
 * </p>
 *
 * @return summary hash for node structure and contents
 */
@Override
public int hashCode() {
    int hash = Arrays.hashCode(this.edgeLabels);
    hash = 31 * hash + Arrays.hashCode(this.children);
    hash = 31 * hash + Arrays.hashCode(this.orderedValues);
    hash = 31 * hash + Arrays.hashCode(this.orderedCounts);
    hash = 31 * hash + this.denseEdgeMin;
    // Only the table size is mixed in; see the method comment for the reason.
    hash = 31 * hash + (this.denseChildren == null ? 0 : this.denseChildren.length);
    return hash;
}
/**
 * Compares structural node content, including dense table availability.
 * <p>
 * Dense tables are compared by presence only. The constructor derives the
 * table from the edge labels and children, so two nodes with equal labels and
 * equal children that both own a dense table necessarily own equal tables.
 * Comparing the tables element by element would compare every child a second
 * time and make the cost double with each trie level.
 * </p>
 *
 * @param object comparison object
 * @return {@code true} when nodes describe identical structure and payload
 */
@Override
public boolean equals(final Object object) {
    if (this == object) {
        return true;
    }
    if (!(object instanceof CompiledNode<?> other)) {
        return false;
    }
    // Cheap scalar checks first, recursive child comparison last.
    return this.denseEdgeMin == other.denseEdgeMin
            && (this.denseChildren == null) == (other.denseChildren == null)
            && Arrays.equals(this.edgeLabels, other.edgeLabels)
            && Arrays.equals(this.orderedCounts, other.orderedCounts)
            && Arrays.equals(this.orderedValues, other.orderedValues)
            && Arrays.equals(this.children, other.children);
}
/**
 * Renders the node's size figures (edge count, value count and dense table
 * length) as a one-line summary for debugging and diagnostics.
 *
 * @return textual node summary
 */
@Override
public String toString() {
    final StringBuilder summary = new StringBuilder(80);
    summary.append("CompiledNode{edgeCount=").append(this.edgeLabels.length);
    summary.append(", orderedValueCount=").append(this.orderedValues.length);
    summary.append(", denseTableLength=").append(denseTableLength());
    summary.append('}');
    return summary.toString();
}
/**
* Finds a child for the supplied edge character.
* <p>
* Lookup order is:
* <ol>
* <li>dense array index (if the label interval is compact enough),</li>
* <li>small-child linear scan when the fallback node has {@value #LINEAR_CHILD_COUNT_THRESHOLD}
* or fewer edges,</li>
* <li>binary search over sorted labels.</li>
* </ol>
* </p>
*
* @param edge edge character
* @return child node, or {@code null} if absent
@@ -149,6 +364,15 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
if (childCount == 0) {
return null;
}
if (this.denseChildren != null) {
final int denseIndex = edge - this.denseEdgeMin;
if (denseIndex < 0 || denseIndex >= this.denseChildren.length) {
return null;
}
return this.denseChildren[denseIndex];
}
if (childCount <= LINEAR_CHILD_COUNT_THRESHOLD) {
for (int index = 0; index < childCount; index++) {
if (this.edgeLabels[index] == edge) {

View File

@@ -95,6 +95,8 @@ import org.junit.jupiter.params.provider.MethodSource;
@Tag("integration")
@Tag("cli")
@Tag("stemmer")
@Tag("compile")
@Tag("slow")
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@DisplayName("Compile integration")
final class CompileIntegrationTest {
@@ -192,6 +194,7 @@ final class CompileIntegrationTest {
* @throws IOException if reading or writing fails
*/
@Test
@Tag("slow")
@DisplayName("CLI should compile the remark-aware fixture and preserve expected lookups")
void shouldCompileRemarkAwareFixtureAndPreserveExpectedLookups() throws IOException {
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
@@ -237,6 +240,7 @@ final class CompileIntegrationTest {
* @throws IOException if reading or writing fails
*/
@Test
@Tag("slow")
@DisplayName("CLI should require overwrite before replacing an existing output artifact")
void shouldRequireOverwriteForExistingOutput() throws IOException {
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
@@ -301,6 +305,7 @@ final class CompileIntegrationTest {
@Nested
@DisplayName("Bundled project dictionary workflows")
@Tag("slow")
final class BundledProjectDictionaryWorkflows {
/**
@@ -322,6 +327,7 @@ final class CompileIntegrationTest {
*/
@ParameterizedTest(name = "[{index}] {0}")
@MethodSource("org.egothor.stemmer.CompileIntegrationTest#bundledDictionaryCases")
@Tag("slow")
@DisplayName("CLI should compile bundled project dictionaries and preserve representative variant semantics")
void shouldCompileBundledProjectDictionaryAndPreserveRepresentativeVariantSemantics(final String scenario,
final String resourcePath) throws IOException {

View File

@@ -66,7 +66,10 @@ import org.junit.jupiter.api.io.TempDir;
* {@link System#exit(int)}.
* </p>
*/
@Tag("unit")
@Tag("integration")
@Tag("cli")
@Tag("compile")
@Tag("stemmer")
@DisplayName("Compile")
class CompileTest {

View File

@@ -70,10 +70,11 @@ import org.junit.jupiter.params.provider.MethodSource;
* <li>compressed artifact reproducibility within the active format version</li>
* </ul>
*/
@Tag("unit")
@Tag("compat")
@Tag("regression")
@Tag("determinism")
@Tag("serialization")
@Tag("trie")
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
final class CompiledTrieArtifactRegressionTest {

View File

@@ -41,7 +41,8 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link DiacriticStripper}.
*/
@Tag("unit")
@Tag("diacritics")
@Tag("diacritic")
@Tag("stemmer")
@DisplayName("DiacriticStripper")
class DiacriticStripperTest {

View File

@@ -59,7 +59,7 @@ import org.junit.jupiter.api.Test;
*/
@DisplayName("FrequencyTrieBuilders")
@Tag("unit")
@Tag("builder")
@Tag("construction")
@Tag("frequency-trie")
class FrequencyTrieBuildersTest {

View File

@@ -47,7 +47,7 @@ import java.util.List;
import net.jqwik.api.ForAll;
import net.jqwik.api.Label;
import net.jqwik.api.Property;
import net.jqwik.api.Tag;
import org.junit.jupiter.api.Tag;
/**
* Property-based tests for the compiled trie abstraction.
@@ -59,9 +59,9 @@ import net.jqwik.api.Tag;
* core algorithm without overfitting to particular fixture data.
*/
@Label("FrequencyTrie properties")
@Tag("unit")
@Tag("property")
@Tag("trie")
@Tag("frequency-trie")
class FrequencyTrieProperties extends PropertyBasedTestSupport {
/**

View File

@@ -33,6 +33,7 @@ package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertSame;
@@ -379,6 +380,24 @@ class FrequencyTrieTest {
assertThrows(UnsupportedOperationException.class, () -> entries.add(new ValueCount<String>("z", 1)));
}
/**
 * Verifies that a key holding exactly one value yields a single-element result
 * from {@link FrequencyTrie#getEntries(String)} and that the returned list
 * rejects modification.
 */
@Test
@DisplayName("getEntries returns a one-item list for single stored values")
void getEntriesReturnsSingleItemListForSingleStoredValue() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("gamma", "only");

    final List<ValueCount<String>> singleEntry = trieBuilder.build().getEntries("gamma");
    final List<ValueCount<String>> expected = List.of(new ValueCount<String>("only", 1));

    assertAll(() -> assertEquals(expected, singleEntry),
            () -> assertThrows(UnsupportedOperationException.class,
                    () -> singleEntry.add(new ValueCount<String>("z", 1))));
}
/**
* Verifies that equal frequencies prefer the shorter string representation.
*/
@@ -755,6 +774,115 @@ class FrequencyTrieTest {
.readFrom(new ByteArrayInputStream(serializedEmptyTrie), String[]::new, null)));
}
/**
 * Verifies that a max-expanded override below {@code -1} is refused with an
 * {@link IllegalArgumentException} carrying the expected message.
 */
@Test
@Tag("persistence")
@DisplayName("readFrom rejects invalid maxExpandedIndex override")
void readFromRejectsInvalidMaxExpandedIndexOverride() {
    final NodeWriter emptyRoot = dataOutput -> {
        dataOutput.writeInt(0);
        dataOutput.writeInt(0);
    };
    final byte[] stream = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { emptyRoot });

    final IllegalArgumentException failure = assertThrows(IllegalArgumentException.class,
            () -> FrequencyTrie.readFrom(new ByteArrayInputStream(stream), String[]::new, STRING_CODEC, -2));

    assertEquals("maxExpandedIndex must be >= -1.", failure.getMessage());
}
/**
 * Verifies how the max-expanded override passed to {@code readFrom} affects
 * the root node's dense lookup table: the three-argument overload and an
 * override of {@code -1} both enable it, an override of {@code 0} disables it,
 * and lookups on the trie read without a dense table still match the original.
 *
 * @throws IOException if writing or reading the in-memory stream fails
 */
@Test
@Tag("persistence")
@DisplayName("readFrom respects dense lookup max-expanded index override")
void readFromRespectsDenseLookupMaxExpandedIndexOverride() throws IOException {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    for (final String key : List.of("a", "b", "c", "d")) {
        trieBuilder.put(key, key);
    }
    final FrequencyTrie<String> source = trieBuilder.build();

    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    source.writeTo(sink, STRING_CODEC);
    final byte[] payload = sink.toByteArray();

    final FrequencyTrie<String> implicitDefault = FrequencyTrie.readFrom(new ByteArrayInputStream(payload),
            String[]::new, STRING_CODEC);
    final FrequencyTrie<String> explicitDefault = FrequencyTrie.readFrom(new ByteArrayInputStream(payload),
            String[]::new, STRING_CODEC, -1);
    final FrequencyTrie<String> withoutDense = FrequencyTrie.readFrom(new ByteArrayInputStream(payload),
            String[]::new, STRING_CODEC, 0);

    assertAll(
            () -> assertTrue(implicitDefault.root().hasDenseLookup(),
                    "Default read should enable dense lookup for compact first-level edges."),
            () -> assertTrue(explicitDefault.root().hasDenseLookup(),
                    "Negative override should use the default dense lookup span."),
            () -> assertFalse(withoutDense.root().hasDenseLookup(),
                    "Zero override should disable dense lookup tables."),
            () -> assertEquals(source.get("a"), withoutDense.get("a")),
            () -> assertEquals(source.get("b"), withoutDense.get("b")),
            () -> assertEquals(source.get("c"), withoutDense.get("c")),
            () -> assertEquals(source.get("d"), withoutDense.get("d")),
            () -> assertEquals(source.get("z"), withoutDense.get("z")));
}
/**
 * Verifies that a two-node stream whose nodes name each other as children is
 * refused with an {@link IOException} reporting a cyclic reference.
 */
@Test
@Tag("persistence")
@DisplayName("readFrom rejects cyclic serialized node references")
void readFromRejectsCyclicSerializedNodeReferences() {
    final NodeWriter firstNode = dataOutput -> {
        dataOutput.writeInt(1);
        dataOutput.writeChar('b');
        dataOutput.writeInt(1);
        dataOutput.writeInt(0);
    };
    final NodeWriter secondNode = dataOutput -> {
        dataOutput.writeInt(1);
        dataOutput.writeChar('a');
        dataOutput.writeInt(0);
        dataOutput.writeInt(0);
    };
    final byte[] cyclicStream = createSerializedStream(0x45475452, 1, 2, 0,
            new NodeWriter[] { firstNode, secondNode });

    final IOException failure = assertThrows(IOException.class,
            () -> FrequencyTrie.readFrom(new ByteArrayInputStream(cyclicStream), String[]::new, STRING_CODEC));

    assertTrue(failure.getMessage().contains("cyclic reference detected"));
}
/**
 * Verifies that a stream declaring a single node, whose only edge refers to
 * node identifier {@code 3}, is refused with an {@link IOException}.
 */
@Test
@Tag("persistence")
@DisplayName("readFrom rejects invalid child node identifiers")
void readFromRejectsInvalidChildNodeId() {
    final NodeWriter danglingEdgeNode = dataOutput -> {
        dataOutput.writeInt(1);
        dataOutput.writeChar('a');
        dataOutput.writeInt(3);
        dataOutput.writeInt(0);
    };
    final byte[] stream = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { danglingEdgeNode });

    final IOException failure = assertThrows(IOException.class,
            () -> FrequencyTrie.readFrom(new ByteArrayInputStream(stream), String[]::new, STRING_CODEC));

    assertTrue(failure.getMessage().contains("Invalid child node id"));
}
/**
* Verifies that deserialization rejects an invalid stream magic header.
*/
@@ -785,6 +913,27 @@ class FrequencyTrieTest {
assertTrue(exception.getMessage().contains("Unsupported trie stream version"));
}
/**
 * Verifies that a stream written with the current format version is refused
 * with an {@link IOException} when its metadata block is an arbitrary UTF
 * string instead of valid metadata.
 */
@Test
@Tag("persistence")
@DisplayName("readFrom rejects invalid textual metadata block")
void readFromRejectsInvalidTextualMetadataBlock() {
    final MetadataWriter malformedMetadata = dataOutput -> dataOutput.writeUTF("not valid metadata");
    final NodeWriter emptyRoot = dataOutput -> {
        dataOutput.writeInt(0);
        dataOutput.writeInt(0);
    };
    final byte[] stream = createSerializedStream(0x45475452, FrequencyTrie.currentFormatVersion(), 1, 0,
            malformedMetadata, new NodeWriter[] { emptyRoot });

    final IOException failure = assertThrows(IOException.class,
            () -> FrequencyTrie.readFrom(new ByteArrayInputStream(stream), String[]::new, STRING_CODEC));

    assertTrue(failure.getMessage().contains("Invalid metadata block"));
}
/**
* Verifies that deserialization rejects a negative node count.
*/
@@ -862,6 +1011,129 @@ class FrequencyTrieTest {
assertTrue(exception.getMessage().contains("Non-positive stored count"));
}
/**
 * Verifies that a version 1 stream, which carries no metadata block, is read
 * with the legacy metadata for version 1 and backward traversal.
 *
 * @throws IOException if reading the in-memory stream fails
 */
@Test
@Tag("persistence")
@DisplayName("readFrom supports legacy version 1 metadata")
void readFromSupportsLegacyVersionOneMetadata() throws IOException {
    final NodeWriter emptyRoot = dataOutput -> {
        dataOutput.writeInt(0);
        dataOutput.writeInt(0);
    };
    final byte[] stream = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { emptyRoot });

    final FrequencyTrie<String> loaded = FrequencyTrie.readFrom(new ByteArrayInputStream(stream), String[]::new,
            STRING_CODEC);

    assertEquals(TrieMetadata.legacy(1, WordTraversalDirection.BACKWARD), loaded.metadata());
}
/**
 * Verifies that a version 2 stream, whose metadata block holds only the
 * traversal direction ordinal, is read with the legacy metadata for version 2
 * and the stored direction.
 *
 * @throws IOException if reading the in-memory stream fails
 */
@Test
@Tag("persistence")
@DisplayName("readFrom supports legacy version 2 metadata")
void readFromSupportsLegacyVersionTwoMetadata() throws IOException {
    final MetadataWriter directionOnly = dataOutput -> dataOutput
            .writeInt(WordTraversalDirection.FORWARD.ordinal());
    final NodeWriter emptyRoot = dataOutput -> {
        dataOutput.writeInt(0);
        dataOutput.writeInt(0);
    };
    final byte[] stream = createSerializedStream(0x45475452, 2, 1, 0, directionOnly,
            new NodeWriter[] { emptyRoot });

    final FrequencyTrie<String> loaded = FrequencyTrie.readFrom(new ByteArrayInputStream(stream), String[]::new,
            STRING_CODEC);

    assertEquals(TrieMetadata.legacy(2, WordTraversalDirection.FORWARD), loaded.metadata());
}
/**
 * Verifies that a version 3 stream yields the stored traversal direction,
 * reduction settings and diacritic mode, and that the case-processing mode,
 * which the stream does not carry, is reported as
 * {@code LOWERCASE_WITH_LOCALE_ROOT}.
 *
 * @throws IOException if reading the in-memory stream fails
 */
@Test
@Tag("persistence")
@DisplayName("readFrom parses version 3 metadata")
void readFromParsesVersionThreeMetadata() throws IOException {
    final ReductionSettings storedSettings = new ReductionSettings(
            ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS, 81, 4);
    final MetadataWriter versionThreeMetadata = dataOutput -> {
        dataOutput.writeInt(WordTraversalDirection.BACKWARD.ordinal());
        dataOutput.writeInt(storedSettings.reductionMode().ordinal());
        dataOutput.writeInt(storedSettings.dominantWinnerMinPercent());
        dataOutput.writeInt(storedSettings.dominantWinnerOverSecondRatio());
        dataOutput.writeInt(DiacriticProcessingMode.REMOVE.ordinal());
    };
    final NodeWriter emptyRoot = dataOutput -> {
        dataOutput.writeInt(0);
        dataOutput.writeInt(0);
    };
    final byte[] stream = createSerializedStream(0x45475452, 3, 1, 0, versionThreeMetadata,
            new NodeWriter[] { emptyRoot });

    final TrieMetadata parsed = FrequencyTrie
            .readFrom(new ByteArrayInputStream(stream), String[]::new, STRING_CODEC).metadata();

    assertAll(() -> assertEquals(3, parsed.formatVersion()),
            () -> assertEquals(WordTraversalDirection.BACKWARD, parsed.traversalDirection()),
            () -> assertEquals(storedSettings, parsed.reductionSettings()),
            () -> assertEquals(DiacriticProcessingMode.REMOVE, parsed.diacriticProcessingMode()),
            () -> assertEquals(CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, parsed.caseProcessingMode()));
}
/**
 * Verifies that a version 4 stream yields the stored traversal direction,
 * reduction settings, diacritic mode and the additionally stored
 * case-processing mode.
 *
 * @throws IOException if reading the in-memory stream fails
 */
@Test
@Tag("persistence")
@DisplayName("readFrom parses version 4 case processing metadata")
void readFromParsesVersionFourCaseMetadata() throws IOException {
    final ReductionSettings storedSettings = new ReductionSettings(
            ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, 75, 3);
    final MetadataWriter versionFourMetadata = dataOutput -> {
        dataOutput.writeInt(WordTraversalDirection.FORWARD.ordinal());
        dataOutput.writeInt(storedSettings.reductionMode().ordinal());
        dataOutput.writeInt(storedSettings.dominantWinnerMinPercent());
        dataOutput.writeInt(storedSettings.dominantWinnerOverSecondRatio());
        dataOutput.writeInt(DiacriticProcessingMode.AS_IS.ordinal());
        dataOutput.writeInt(CaseProcessingMode.AS_IS.ordinal());
    };
    final NodeWriter emptyRoot = dataOutput -> {
        dataOutput.writeInt(0);
        dataOutput.writeInt(0);
    };
    final byte[] stream = createSerializedStream(0x45475452, 4, 1, 0, versionFourMetadata,
            new NodeWriter[] { emptyRoot });

    final TrieMetadata parsed = FrequencyTrie
            .readFrom(new ByteArrayInputStream(stream), String[]::new, STRING_CODEC).metadata();

    assertAll(() -> assertEquals(4, parsed.formatVersion()),
            () -> assertEquals(WordTraversalDirection.FORWARD, parsed.traversalDirection()),
            () -> assertEquals(storedSettings, parsed.reductionSettings()),
            () -> assertEquals(DiacriticProcessingMode.AS_IS, parsed.diacriticProcessingMode()),
            () -> assertEquals(CaseProcessingMode.AS_IS, parsed.caseProcessingMode()));
}
/**
 * Verifies that a version 2 stream carrying the value {@code 999} as its
 * traversal direction ordinal is refused with an {@link IOException}.
 */
@Test
@Tag("persistence")
@DisplayName("readFrom rejects invalid metadata ordinal in legacy stream")
void readFromRejectsInvalidLegacyMetadataOrdinal() {
    final MetadataWriter bogusDirection = dataOutput -> dataOutput.writeInt(999);
    final NodeWriter emptyRoot = dataOutput -> {
        dataOutput.writeInt(0);
        dataOutput.writeInt(0);
    };
    final byte[] stream = createSerializedStream(0x45475452, 2, 1, 0, bogusDirection,
            new NodeWriter[] { emptyRoot });

    final IOException failure = assertThrows(IOException.class,
            () -> FrequencyTrie.readFrom(new ByteArrayInputStream(stream), String[]::new, STRING_CODEC));

    assertTrue(failure.getMessage().contains("Invalid traversal direction ordinal"));
}
/**
* Writes one node body into a synthetic serialized trie stream.
*/
@@ -889,6 +1161,24 @@ class FrequencyTrieTest {
*/
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
        final int rootNodeId, final NodeWriter[] nodes) {
    // Callers of this overload supply no metadata block, so the hook writes
    // nothing; tests that need one use the overload taking a metadata writer.
    final MetadataWriter noMetadata = dataOutput -> {
    };
    return createSerializedStream(magic, version, nodeCount, rootNodeId, noMetadata, nodes);
}
/**
* Writes a synthetic serialized trie stream with a metadata writer hook.
*
* @param magic stream magic
* @param version stream version
* @param nodeCount declared node count
* @param rootNodeId declared root node identifier
* @param metadata version-specific metadata writer
* @param nodes node body writers
* @return serialized bytes
*/
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
final int rootNodeId, final MetadataWriter metadata, final NodeWriter[] nodes) {
try {
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
final DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream);
@@ -897,6 +1187,7 @@ class FrequencyTrieTest {
dataOutputStream.writeInt(version);
dataOutputStream.writeInt(nodeCount);
dataOutputStream.writeInt(rootNodeId);
metadata.write(dataOutputStream);
for (NodeWriter node : nodes) {
node.write(dataOutputStream);
@@ -908,4 +1199,19 @@ class FrequencyTrieTest {
throw new IllegalStateException("Unexpected I/O while building synthetic trie stream.", exception);
}
}
/**
* Writes one synthetic metadata block.
* <p>
* Implementations are handed to {@code createSerializedStream}, which calls
* them after the root node identifier has been written and before any node
* body is written.
* </p>
*/
@FunctionalInterface
private interface MetadataWriter {
/**
* Writes metadata bytes for one stream version.
*
* @param dataOutput output stream
* @throws IOException if writing fails
*/
void write(DataOutputStream dataOutput) throws IOException;
}
}

View File

@@ -65,10 +65,9 @@ import org.junit.jupiter.api.io.TempDir;
* stems declared by the source dictionary.
*/
@DisplayName("Deterministic fuzz-style trie and stemmer compilation")
@Tag("unit")
@Tag("fuzz")
@Tag("trie")
@Tag("stemming")
@Tag("stemmer")
class FuzzStemmerAndTrieCompilationTest {
/**

View File

@@ -36,7 +36,7 @@ import static org.junit.jupiter.api.Assertions.assertNotNull;
import net.jqwik.api.ForAll;
import net.jqwik.api.Label;
import net.jqwik.api.Property;
import net.jqwik.api.Tag;
import org.junit.jupiter.api.Tag;
/**
* Property-based tests for {@link PatchCommandEncoder}.
@@ -47,9 +47,9 @@ import net.jqwik.api.Tag;
* reconstruct the exact requested target.
*/
@Label("PatchCommandEncoder properties")
@Tag("unit")
@Tag("property")
@Tag("patch")
@Tag("stemmer")
class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
/**

View File

@@ -241,7 +241,7 @@ class PatchCommandEncoderTest {
*/
@Nested
@DisplayName("construction")
@Tag("constructor")
@Tag("construction")
class ConstructionTests {
/**
@@ -326,7 +326,7 @@ class PatchCommandEncoderTest {
*/
@Nested
@DisplayName("encode(String, String)")
@Tag("encode")
@Tag("encoding")
class EncodeTests {
/**
@@ -658,7 +658,7 @@ class PatchCommandEncoderTest {
*/
@Nested
@DisplayName("reversed-word processing")
@Tag("reverse")
@Tag("normalization")
class ReversedWordProcessingTests {
/**

View File

@@ -75,6 +75,7 @@ import org.junit.jupiter.api.io.TempDir;
@DisplayName("StemmerDictionaryParser")
@Tag("unit")
@Tag("parser")
@Tag("stemmer")
class StemmerDictionaryParserTest {
/**

View File

@@ -54,9 +54,9 @@ import org.junit.jupiter.api.io.TempDir;
/**
* Tests for {@link StemmerKnowledgeExperiment}.
*/
@Tag("unit")
@Tag("integration")
@Tag("stemmer")
@Tag("trie")
final class StemmerKnowledgeExperimentTest {
/**

View File

@@ -38,6 +38,8 @@ import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.mockStatic;
import static org.mockito.Mockito.verify;
@@ -91,6 +93,8 @@ import org.mockito.MockedStatic;
@Tag("unit")
@Tag("io")
@Tag("persistence")
@Tag("serialization")
@Tag("trie")
@DisplayName("StemmerPatchTrieBinaryIO")
class StemmerPatchTrieBinaryIOTest {
@@ -299,9 +303,19 @@ class StemmerPatchTrieBinaryIOTest {
"read(Path) must reject null path."),
() -> assertThrows(NullPointerException.class, () -> StemmerPatchTrieBinaryIO.read((String) null),
"read(String) must reject null file name."),
() -> assertThrows(NullPointerException.class,
() -> StemmerPatchTrieBinaryIO.read((Path) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
"read(Path, int) must reject null path."),
() -> assertThrows(NullPointerException.class,
() -> StemmerPatchTrieBinaryIO.read((String) null,
FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
"read(String, int) must reject null file name."),
() -> assertThrows(NullPointerException.class,
() -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null),
"read(InputStream) must reject null input stream."));
"read(InputStream) must reject null input stream."),
() -> assertThrows(NullPointerException.class,
() -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
"read(InputStream, int) must reject null input stream."));
}
/**
@@ -385,6 +399,143 @@ class StemmerPatchTrieBinaryIOTest {
}
}
/**
 * Verifies that {@code read(InputStream, int)} returns the trie produced by
 * the four-argument {@code FrequencyTrie.readFrom} and forwards the supplied
 * dense span override to it unchanged.
 *
 * @throws IOException if preparing the gzip payload fails
 */
@SuppressWarnings("unchecked")
@Test
@DisplayName("Should delegate stream read with dense span override")
void shouldDelegateInputStreamReadWithDenseSpanOverride() throws IOException {
    final int denseSpanOverride = 17;
    final FrequencyTrie<String> stubbedTrie = mock(FrequencyTrie.class);
    final byte[] compressed = gzip("binary-content-with-max-expanded-index");

    try (@SuppressWarnings("rawtypes")
    MockedStatic<FrequencyTrie> frequencyTrieStatics = mockStatic(FrequencyTrie.class)) {
        frequencyTrieStatics.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
                any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(stubbedTrie);

        final FrequencyTrie<String> loaded = StemmerPatchTrieBinaryIO.read(new ByteArrayInputStream(compressed),
                denseSpanOverride);

        assertSame(stubbedTrie, loaded,
                "read(InputStream, int) must return the trie produced by FrequencyTrie.readFrom(...).");
        frequencyTrieStatics.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
                any(FrequencyTrie.ValueStreamCodec.class), eq(denseSpanOverride)));
    }
}
/**
 * Verifies that {@code read(Path, int)} returns the trie produced by the
 * four-argument {@code FrequencyTrie.readFrom} and forwards an override of
 * {@code 0} to it unchanged.
 *
 * @throws IOException if preparing the gzip file fails
 */
@SuppressWarnings("unchecked")
@Test
@DisplayName("Should delegate path read with dense span override")
void shouldDelegatePathReadWithDenseSpanOverride() throws IOException {
    final int denseSpanOverride = 0;
    final FrequencyTrie<String> stubbedTrie = mock(FrequencyTrie.class);
    final Path gzipFile = temporaryDirectory.resolve("input-max-expanded.bin.gz");
    Files.write(gzipFile, gzip("path-based-max-expanded-index"));

    try (@SuppressWarnings("rawtypes")
    MockedStatic<FrequencyTrie> frequencyTrieStatics = mockStatic(FrequencyTrie.class)) {
        frequencyTrieStatics.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
                any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(stubbedTrie);

        final FrequencyTrie<String> loaded = StemmerPatchTrieBinaryIO.read(gzipFile, denseSpanOverride);

        assertSame(stubbedTrie, loaded,
                "read(Path, int) must return the trie produced by FrequencyTrie.readFrom(...).");
        frequencyTrieStatics.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
                any(FrequencyTrie.ValueStreamCodec.class), eq(denseSpanOverride)));
    }
}
/**
 * Verifies that {@code read(String, int)} returns the trie produced by the
 * four-argument {@code FrequencyTrie.readFrom} and forwards an override of
 * {@code 32} to it unchanged.
 *
 * @throws IOException if preparing the gzip file fails
 */
@SuppressWarnings("unchecked")
@Test
@DisplayName("Should delegate file name read with dense span override")
void shouldDelegateStringReadWithDenseSpanOverride() throws IOException {
    final int denseSpanOverride = 32;
    final FrequencyTrie<String> stubbedTrie = mock(FrequencyTrie.class);
    final Path gzipFile = temporaryDirectory.resolve("input-string-max-expanded.bin.gz");
    Files.write(gzipFile, gzip("string-based-max-expanded-index"));

    try (@SuppressWarnings("rawtypes")
    MockedStatic<FrequencyTrie> frequencyTrieStatics = mockStatic(FrequencyTrie.class)) {
        frequencyTrieStatics.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
                any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(stubbedTrie);

        final FrequencyTrie<String> loaded = StemmerPatchTrieBinaryIO.read(gzipFile.toString(),
                denseSpanOverride);

        assertSame(stubbedTrie, loaded,
                "read(String, int) must return the trie produced by FrequencyTrie.readFrom(...).");
        frequencyTrieStatics.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
                any(FrequencyTrie.ValueStreamCodec.class), eq(denseSpanOverride)));
    }
}
/**
 * Verifies that {@code readMetadata(InputStream)} returns metadata equal to
 * that of the trie previously written to the same bytes.
 *
 * @throws IOException if writing or reading the in-memory payload fails
 */
@Test
@DisplayName("Should read metadata from gzip payload")
void shouldReadMetadataFromGzipPayload() throws IOException {
    final FrequencyTrie.Builder<String> trieBuilder = new FrequencyTrie.Builder<String>(String[]::new,
            ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
    final String patch = PatchCommandEncoder.builder().build().encode("running", "run");
    trieBuilder.put("run", patch);
    final FrequencyTrie<String> written = trieBuilder.build();

    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    StemmerPatchTrieBinaryIO.write(written, sink);
    final ByteArrayInputStream persisted = new ByteArrayInputStream(sink.toByteArray());

    final TrieMetadata restored = StemmerPatchTrieBinaryIO.readMetadata(persisted);

    assertEquals(written.metadata(), restored,
            "readMetadata(InputStream) must return the same metadata persisted by write().");
}
/**
 * Verifies that {@code readMetadata(Path)} yields metadata equal to
 * {@code trie.metadata()} for a trie previously written to that path with
 * {@code write(trie, Path)}.
 *
 * @throws IOException if the temporary binary file cannot be written or read
 */
@Test
@DisplayName("Should read metadata from file path")
void shouldReadMetadataFromPath() throws IOException {
    final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new,
            ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
    builder.put("city", PatchCommandEncoder.builder().build().encode("cities", "city"));
    final FrequencyTrie<String> trie = builder.build();
    final Path sourceFile = temporaryDirectory.resolve("metadata-path.bin.gz");
    StemmerPatchTrieBinaryIO.write(trie, sourceFile);
    final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(sourceFile);
    // Failure message names the overload so a report distinguishes it from the
    // InputStream and String variants tested alongside.
    assertEquals(trie.metadata(), metadata,
            "readMetadata(Path) must return the same metadata persisted by write().");
}
/**
 * Verifies that {@code readMetadata(String)} yields metadata equal to
 * {@code trie.metadata()} for a trie previously written with
 * {@code write(trie, Path)} to the file the name refers to.
 *
 * @throws IOException if the temporary binary file cannot be written or read
 */
@Test
@DisplayName("Should read metadata from file name")
void shouldReadMetadataFromStringPath() throws IOException {
    final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new,
            ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
    builder.put("city", PatchCommandEncoder.builder().build().encode("cities", "city"));
    final FrequencyTrie<String> trie = builder.build();
    final Path sourceFile = temporaryDirectory.resolve("metadata-string.bin.gz");
    StemmerPatchTrieBinaryIO.write(trie, sourceFile);
    final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(sourceFile.toString());
    // Failure message names the overload so a report distinguishes it from the
    // InputStream and Path variants tested alongside.
    assertEquals(trie.metadata(), metadata,
            "readMetadata(String) must return the same metadata persisted by write().");
}
/**
* Verifies that malformed non-GZip input is reported as an I/O failure.
*/

View File

@@ -85,9 +85,10 @@ import org.junit.jupiter.params.provider.MethodSource;
* <li>the current bundled language set, including right-to-left metadata</li>
* </ul>
*/
@Tag("unit")
@Tag("integration")
@Tag("stemmer")
@Tag("io")
@Tag("parser")
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
final class StemmerPatchTrieLoaderTest {
@@ -210,36 +211,43 @@ final class StemmerPatchTrieLoaderTest {
Arguments.of("14-load-binary-string",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null),
StemmerPatchTrieLoader.FILENAME_REQUIRED),
Arguments.of("15-load-binary-stream",
Arguments.of("15-load-binary-path-override",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
"path"),
Arguments.of("16-load-binary-string-override",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null,
FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
StemmerPatchTrieLoader.FILENAME_REQUIRED),
Arguments.of("17-load-binary-stream",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null),
"inputStream"),
Arguments.of("16-save-binary-null-trie-path",
Arguments.of("18-save-binary-null-trie-path",
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath()), "trie"),
Arguments.of("17-save-binary-null-path",
Arguments.of("19-save-binary-null-path",
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (Path) null), "path"),
Arguments.of("18-save-binary-null-trie-string",
Arguments.of("20-save-binary-null-trie-string",
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath().toString()),
"trie"),
Arguments.of("19-save-binary-null-string",
Arguments.of("21-save-binary-null-string",
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
StemmerPatchTrieLoader.FILENAME_REQUIRED),
Arguments.of("20-load-language-null-metadata",
Arguments.of("22-load-language-null-metadata",
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
true, (TrieMetadata) null),
"metadata"),
Arguments.of("21-load-path-null-metadata",
Arguments.of("23-load-path-null-metadata",
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (TrieMetadata) null),
"metadata"),
Arguments.of("22-load-string-null-metadata",
Arguments.of("24-load-string-null-metadata",
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
(TrieMetadata) null),
"metadata"),
Arguments.of("23-load-binary-metadata-path-null",
Arguments.of("25-load-binary-metadata-path-null",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((Path) null), "path"),
Arguments.of("24-load-binary-metadata-string-null",
Arguments.of("26-load-binary-metadata-string-null",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((String) null),
StemmerPatchTrieLoader.FILENAME_REQUIRED),
Arguments.of("25-load-binary-metadata-stream-null",
Arguments.of("27-load-binary-metadata-stream-null",
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((InputStream) null),
"inputStream"));
}
@@ -512,6 +520,44 @@ final class StemmerPatchTrieLoaderTest {
}
}
/**
 * Verifies that the binary load overloads taking an explicit dense lookup span
 * produce tries whose patch semantics match the saved original for a set of
 * probe words, and that a span of zero leaves the loaded root without dense
 * lookup for both the {@code Path} and the {@code String} overload.
 *
 * @throws IOException if the dictionary or binary fixture cannot be written or
 *                     read
 */
@Test
@DisplayName("Binary dense-span override overloads should load equivalent tries")
void shouldLoadBinaryWithDenseSpanOverrideOverloads() throws IOException {
    final Path dictionaryFile = writeDictionary("""
            run running runs runner
            city cities
            study studies studying
            """);
    final Path binaryFile = tempDir.resolve("stemmer-trie-overrides.bin.gz");
    final FrequencyTrie<String> original = StemmerPatchTrieLoader.load(dictionaryFile, true,
            DEFAULT_REDUCTION_MODE);
    StemmerPatchTrieLoader.saveBinary(original, binaryFile);

    final FrequencyTrie<String> fromPathDefault = StemmerPatchTrieLoader.loadBinary(binaryFile);
    final FrequencyTrie<String> fromPathDefaultByNegative = StemmerPatchTrieLoader.loadBinary(binaryFile,
            FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX);
    final FrequencyTrie<String> fromPathNoDense = StemmerPatchTrieLoader.loadBinary(binaryFile, 0);
    final FrequencyTrie<String> fromStringNoDense = StemmerPatchTrieLoader.loadBinary(binaryFile.toString(), 0);

    assertTriePatchSemanticsEqual(original, fromPathDefault, "run", "running", "runner", "cities", "studying");
    assertTriePatchSemanticsEqual(original, fromPathDefaultByNegative, "run", "running", "runner", "cities",
            "studying");
    assertTriePatchSemanticsEqual(original, fromPathNoDense, "run", "running", "runner", "cities", "studying");
    assertTriePatchSemanticsEqual(original, fromStringNoDense, "run", "running", "runner", "cities",
            "studying");

    // Distinct messages so a failure report identifies which overload regressed.
    assertFalse(fromPathNoDense.root().hasDenseLookup(),
            "loadBinary(Path, 0): zero span should disable dense lookup on the loaded root.");
    assertFalse(fromStringNoDense.root().hasDenseLookup(),
            "loadBinary(String, 0): zero span should disable dense lookup on the loaded root.");
}
/**
* Writes a dictionary file into the temporary directory.
*
@@ -530,6 +576,7 @@ final class StemmerPatchTrieLoaderTest {
* Bundled dictionary integration tests.
*/
@Nested
@Tag("slow")
@DisplayName("Bundled dictionaries")
final class BundledDictionaryTests {

View File

@@ -44,7 +44,7 @@ import java.util.Set;
import net.jqwik.api.ForAll;
import net.jqwik.api.Label;
import net.jqwik.api.Property;
import net.jqwik.api.Tag;
import org.junit.jupiter.api.Tag;
/**
* Property-based tests for patch-command stemmer tries.
@@ -56,9 +56,8 @@ import net.jqwik.api.Tag;
* persistence must not alter that behavior.
*/
@Label("Stemmer patch trie properties")
@Tag("unit")
@Tag("property")
@Tag("stemming")
@Tag("stemmer")
class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
/**

View File

@@ -40,6 +40,8 @@ import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
@Tag("unit")
@Tag("metadata")
@Tag("trie")
@DisplayName("TrieMetadata")
class TrieMetadataTest {

View File

@@ -40,6 +40,8 @@ import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
@Tag("unit")
@Tag("core")
@Tag("stemmer")
@DisplayName("WordTraversalDirection")
class WordTraversalDirectionTest {

View File

@@ -45,7 +45,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link ChildDescriptor}.
*/
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("ChildDescriptor")
class ChildDescriptorTest {

View File

@@ -31,8 +31,10 @@
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
@@ -43,7 +45,6 @@ import org.junit.jupiter.api.Test;
* documented backing-array exposure.
*/
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("CompiledNode and NodeData")
class CompiledNodeAndNodeDataTest {
@@ -141,4 +142,136 @@ class CompiledNodeAndNodeDataTest {
assertSame(orderedValues, node.orderedValues());
assertSame(orderedCounts, node.orderedCounts());
}
/**
 * Verifies that a node built over the contiguous edge labels {@code 'a'..'d'}
 * with the four-argument constructor reports dense lookup, resolves its first
 * and last child by label, and returns {@code null} for an absent label.
 */
@Test
@DisplayName("CompiledNode can resolve child via dense lookup table")
void compiledNodeUsesDenseLookupForCompactIntervals() {
    final char[] edgeLabels = { 'a', 'b', 'c', 'd' };
    @SuppressWarnings("unchecked")
    final CompiledNode<String>[] children = new CompiledNode[edgeLabels.length];
    // Every child is an independent empty node so identity checks are meaningful.
    for (int index = 0; index < children.length; index++) {
        children[index] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
    }

    final CompiledNode<String> node = new CompiledNode<>(edgeLabels, children,
            new String[] { "1", "2", "3", "4" }, new int[] { 1, 1, 1, 1 });

    assertTrue(node.hasDenseLookup());
    assertSame(children[0], node.findChild('a'));
    assertSame(children[3], node.findChild('d'));
    assertSame(null, node.findChild('z'));
}
/**
 * Verifies child resolution on a four-child node constructed with an explicit
 * {@code 0} argument (presumably the dense span limit; confirm against the
 * {@code CompiledNode} constructor): the node reports no dense lookup, still
 * resolves its first and last child, and returns {@code null} for an absent
 * label. The test name indicates this degree is expected to take the
 * linear-scan path; the assertions themselves only observe the results.
 */
@Test
@DisplayName("CompiledNode resolves child by linear scan for small degree")
void compiledNodeUsesLinearScanForSmallDegree() {
    // Widely spread labels, including CJK code points, in ascending order.
    final char[] edgeLabels = { 'a', 'z', '中', '你' };
    @SuppressWarnings("unchecked")
    final CompiledNode<String>[] children = new CompiledNode[edgeLabels.length];
    for (int index = 0; index < children.length; index++) {
        children[index] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
    }

    final CompiledNode<String> node = new CompiledNode<>(edgeLabels, children,
            new String[] { "1", "2", "3", "4" }, 0, new int[] { 1, 1, 1, 1 });

    assertFalse(node.hasDenseLookup());
    assertSame(children[0], node.findChild('a'));
    assertSame(children[3], node.findChild('你'));
    assertSame(null, node.findChild('b'));
}
/**
 * Verifies child resolution on a five-child node constructed with an explicit
 * {@code 0} argument (presumably the dense span limit; confirm against the
 * {@code CompiledNode} constructor): the node reports no dense lookup, resolves
 * a middle and the last child, and returns {@code null} for an absent label.
 * The test name indicates this degree is expected to take the binary-search
 * path; the assertions themselves only observe the results.
 */
@Test
@DisplayName("CompiledNode resolves child by binary search for large degree")
void compiledNodeUsesBinarySearchForLargeDegree() {
    final char[] edgeLabels = { 'a', 'c', 'k', 't', 'z' };
    @SuppressWarnings("unchecked")
    final CompiledNode<String>[] children = new CompiledNode[edgeLabels.length];
    for (int index = 0; index < children.length; index++) {
        children[index] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
    }

    final CompiledNode<String> node = new CompiledNode<>(edgeLabels, children,
            new String[] { "1", "2", "3", "4", "5" }, 0, new int[] { 1, 1, 1, 1, 1 });

    assertFalse(node.hasDenseLookup());
    // 'k' sits at index 2 and 'z' at index 4 of the label array.
    assertSame(children[2], node.findChild('k'));
    assertSame(children[4], node.findChild('z'));
    assertSame(null, node.findChild('x'));
}
/**
 * Verifies the state helpers of {@code CompiledNode} on two fixtures: an empty
 * node with no edges and no values, and a node with one edge {@code 'a'} and
 * one stored value.
 */
@Test
@DisplayName("CompiledNode reports leaf, value and edge presence state")
void compiledNodeReportsNodeStateHelpers() {
    // Fixture 1: no children, no values.
    @SuppressWarnings("unchecked")
    final CompiledNode<String>[] noChildren = new CompiledNode[0];
    final CompiledNode<String> emptyLeaf = new CompiledNode<>(new char[0], noChildren, new String[0], new int[0]);

    assertTrue(emptyLeaf.isLeaf());
    assertFalse(emptyLeaf.hasChildren());
    assertFalse(emptyLeaf.hasValues());
    assertFalse(emptyLeaf.hasEdge('a'));

    // Fixture 2: a single edge 'a'; parent and child share the same value arrays.
    @SuppressWarnings("unchecked")
    final CompiledNode<String>[] singleChild = new CompiledNode[1];
    final String[] values = new String[] { "leaf" };
    final int[] counts = new int[] { 1 };
    singleChild[0] = new CompiledNode<>(new char[0], new CompiledNode[0], values, counts);
    final CompiledNode<String> parent = new CompiledNode<>(new char[] { 'a' }, singleChild, values, counts);

    assertFalse(parent.isLeaf());
    assertTrue(parent.hasChildren());
    assertTrue(parent.hasValues());
    assertTrue(parent.valueCount() > 0);
    assertTrue(parent.hasEdge('a'));
    assertFalse(parent.hasEdge('b'));
}
/**
 * Verifies that two {@code CompiledNode} instances built from equal edge
 * labels, the same child array, and equal value and count arrays compare equal
 * and report the same hash code.
 */
@Test
@DisplayName("CompiledNode equals and hashCode align for identical structure")
void compiledNodeEqualsAndHashCodeAlignForIdenticalStructure() {
    final CompiledNode<String> sharedLeaf = new CompiledNode<>(new char[0], new CompiledNode[0],
            new String[] { "v" }, new int[] { 1 });
    @SuppressWarnings("unchecked")
    final CompiledNode<String>[] sharedChildren = new CompiledNode[1];
    sharedChildren[0] = sharedLeaf;

    // Label, value and count arrays are fresh per node; only the child array is shared.
    final CompiledNode<String> left = new CompiledNode<>(new char[] { 'a' }, sharedChildren,
            new String[] { "x" }, new int[] { 2 });
    final CompiledNode<String> right = new CompiledNode<>(new char[] { 'a' }, sharedChildren,
            new String[] { "x" }, new int[] { 2 });

    assertEquals(left, right);
    assertEquals(left.hashCode(), right.hashCode());
}
}

View File

@@ -41,7 +41,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link DominantLocalDescriptor}.
*/
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("DominantLocalDescriptor")
class DominantLocalDescriptorTest {

View File

@@ -50,7 +50,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link LocalValueSummary}.
*/
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("LocalValueSummary")
class LocalValueSummaryTest {

View File

@@ -44,7 +44,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link MutableNode}.
*/
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("MutableNode")
class MutableNodeTest {

View File

@@ -41,7 +41,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link RankedLocalDescriptor}.
*/
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("RankedLocalDescriptor")
class RankedLocalDescriptorTest {

View File

@@ -48,7 +48,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link ReducedNode}.
*/
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("ReducedNode")
class ReducedNodeTest {

View File

@@ -47,7 +47,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link ReductionContext}.
*/
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("ReductionContext")
class ReductionContextTest {

View File

@@ -46,7 +46,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link ReductionSignature}.
*/
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("ReductionSignature")
class ReductionSignatureTest {

View File

@@ -41,7 +41,7 @@ import org.junit.jupiter.api.Test;
* Unit tests for {@link UnorderedLocalDescriptor}.
*/
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("UnorderedLocalDescriptor")
class UnorderedLocalDescriptorTest {