feat: implement dense-child optimized trie lookup and enterprise test/CI profile hardening
This commit is contained in:
12
.github/workflows/build.yml
vendored
12
.github/workflows/build.yml
vendored
@@ -51,7 +51,7 @@ jobs:
|
|||||||
test -f gradle/verification-metadata.xml
|
test -f gradle/verification-metadata.xml
|
||||||
|
|
||||||
- name: Execute build, tests, PMD, coverage, Javadoc, distribution packaging, and SBOM generation
|
- name: Execute build, tests, PMD, coverage, Javadoc, distribution packaging, and SBOM generation
|
||||||
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport distZip cyclonedxBom
|
run: ./gradlew --no-daemon clean ciRelease distZip pmdMain javadoc jacocoCiReleaseReport cyclonedxBom
|
||||||
|
|
||||||
- name: Upload SBOM
|
- name: Upload SBOM
|
||||||
if: always()
|
if: always()
|
||||||
@@ -70,8 +70,8 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
name: test-reports
|
name: test-reports
|
||||||
path: |
|
path: |
|
||||||
build/reports/tests/test
|
build/reports/tests
|
||||||
build/test-results/test
|
build/test-results
|
||||||
if-no-files-found: warn
|
if-no-files-found: warn
|
||||||
retention-days: 14
|
retention-days: 14
|
||||||
|
|
||||||
@@ -90,8 +90,8 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
name: coverage-reports
|
name: coverage-reports
|
||||||
path: |
|
path: |
|
||||||
build/reports/jacoco/test/html
|
build/reports/jacoco/jacocoCiReleaseReport/html
|
||||||
build/reports/jacoco/test/jacocoTestReport.xml
|
build/reports/jacoco/jacocoCiReleaseReport/jacocoCiReleaseReport.xml
|
||||||
if-no-files-found: warn
|
if-no-files-found: warn
|
||||||
retention-days: 14
|
retention-days: 14
|
||||||
|
|
||||||
@@ -160,7 +160,7 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
SIGNING_KEY: ${{ secrets.SIGNING_KEY }}
|
SIGNING_KEY: ${{ secrets.SIGNING_KEY }}
|
||||||
SIGNING_PASSWORD: ${{ secrets.SIGNING_PASSWORD }}
|
SIGNING_PASSWORD: ${{ secrets.SIGNING_PASSWORD }}
|
||||||
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport cyclonedxBom centralBundle
|
run: ./gradlew --no-daemon clean ciRelease distZip pmdMain javadoc jacocoCiReleaseReport cyclonedxBom centralBundle
|
||||||
|
|
||||||
- name: Generate release changelog
|
- name: Generate release changelog
|
||||||
shell: bash
|
shell: bash
|
||||||
|
|||||||
19
.github/workflows/pages.yml
vendored
19
.github/workflows/pages.yml
vendored
@@ -70,7 +70,7 @@ jobs:
|
|||||||
test -f gradle/verification-metadata.xml
|
test -f gradle/verification-metadata.xml
|
||||||
|
|
||||||
- name: Build reports for publication
|
- name: Build reports for publication
|
||||||
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport pitest jmh cyclonedxBom
|
run: ./gradlew --no-daemon clean ciRelease pmdMain javadoc jacocoCiReleaseReport pitest jmh cyclonedxBom
|
||||||
|
|
||||||
- name: Prepare gh-pages worktree
|
- name: Prepare gh-pages worktree
|
||||||
shell: bash
|
shell: bash
|
||||||
@@ -93,6 +93,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
TEST_REPORT_DIR="build/reports/tests/ciRelease"
|
||||||
|
JACOCO_REPORT_DIR="build/reports/jacoco/jacocoCiReleaseReport"
|
||||||
|
|
||||||
SITE_DIR=".gh-pages"
|
SITE_DIR=".gh-pages"
|
||||||
RUN_DIR="${SITE_DIR}/builds/${GITHUB_RUN_NUMBER}"
|
RUN_DIR="${SITE_DIR}/builds/${GITHUB_RUN_NUMBER}"
|
||||||
RUN_METRICS_DIR="${RUN_DIR}/metrics"
|
RUN_METRICS_DIR="${RUN_DIR}/metrics"
|
||||||
@@ -106,14 +109,14 @@ jobs:
|
|||||||
cp -R build/docs/javadoc "${RUN_DIR}/javadoc"
|
cp -R build/docs/javadoc "${RUN_DIR}/javadoc"
|
||||||
cp -R build/docs/javadoc "${LATEST_DIR}/javadoc"
|
cp -R build/docs/javadoc "${LATEST_DIR}/javadoc"
|
||||||
|
|
||||||
cp -R build/reports/tests/test "${RUN_DIR}/test"
|
cp -R "${TEST_REPORT_DIR}" "${RUN_DIR}/test"
|
||||||
cp -R build/reports/tests/test "${LATEST_DIR}/test"
|
cp -R "${TEST_REPORT_DIR}" "${LATEST_DIR}/test"
|
||||||
|
|
||||||
cp -R build/reports/pmd "${RUN_DIR}/pmd"
|
cp -R build/reports/pmd "${RUN_DIR}/pmd"
|
||||||
cp -R build/reports/pmd "${LATEST_DIR}/pmd"
|
cp -R build/reports/pmd "${LATEST_DIR}/pmd"
|
||||||
|
|
||||||
cp -R build/reports/jacoco/test/html "${RUN_DIR}/coverage"
|
cp -R "${JACOCO_REPORT_DIR}/html" "${RUN_DIR}/coverage"
|
||||||
cp -R build/reports/jacoco/test/html "${LATEST_DIR}/coverage"
|
cp -R "${JACOCO_REPORT_DIR}/html" "${LATEST_DIR}/coverage"
|
||||||
|
|
||||||
cp -R build/reports/pitest "${RUN_DIR}/pitest"
|
cp -R build/reports/pitest "${RUN_DIR}/pitest"
|
||||||
cp -R build/reports/pitest "${LATEST_DIR}/pitest"
|
cp -R build/reports/pitest "${LATEST_DIR}/pitest"
|
||||||
@@ -178,7 +181,7 @@ jobs:
|
|||||||
|
|
||||||
python3 \
|
python3 \
|
||||||
./tools/generate-pages-badges.py \
|
./tools/generate-pages-badges.py \
|
||||||
--jacoco-xml build/reports/jacoco/test/jacocoTestReport.xml \
|
--jacoco-xml "${JACOCO_REPORT_DIR}/jacocoCiReleaseReport.xml" \
|
||||||
--pit-xml build/reports/pitest/mutations.xml \
|
--pit-xml build/reports/pitest/mutations.xml \
|
||||||
--jmh-csv build/reports/jmh/jmh-results.csv \
|
--jmh-csv build/reports/jmh/jmh-results.csv \
|
||||||
--run-metrics-dir "${RUN_METRICS_DIR}" \
|
--run-metrics-dir "${RUN_METRICS_DIR}" \
|
||||||
@@ -228,7 +231,7 @@ jobs:
|
|||||||
<p class="meta">Build ${GITHUB_RUN_NUMBER} from commit ${GITHUB_SHA}</p>
|
<p class="meta">Build ${GITHUB_RUN_NUMBER} from commit ${GITHUB_SHA}</p>
|
||||||
<ul>
|
<ul>
|
||||||
<li><a href="./javadoc/">Javadoc</a></li>
|
<li><a href="./javadoc/">Javadoc</a></li>
|
||||||
<li><a href="./test/">Test Report</a></li>
|
<li><a href="./test/">Release Verification Test Report (ciRelease)</a></li>
|
||||||
<li><a href="./pmd/main.html">PMD Report</a></li>
|
<li><a href="./pmd/main.html">PMD Report</a></li>
|
||||||
<li><a href="./coverage/">Coverage Report</a></li>
|
<li><a href="./coverage/">Coverage Report</a></li>
|
||||||
${DEPENDENCY_CHECK_LINK:-<li>Dependency Vulnerability Report: not available</li>}
|
${DEPENDENCY_CHECK_LINK:-<li>Dependency Vulnerability Report: not available</li>}
|
||||||
@@ -260,7 +263,7 @@ jobs:
|
|||||||
|
|
||||||
- [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/)
|
- [Latest build summary](https://leogalambos.github.io/Radixor/builds/latest/)
|
||||||
- [Javadoc](https://leogalambos.github.io/Radixor/builds/latest/javadoc/)
|
- [Javadoc](https://leogalambos.github.io/Radixor/builds/latest/javadoc/)
|
||||||
- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
- [Release verification test report (ciRelease)](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
||||||
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
|
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
|
||||||
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
||||||
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
||||||
|
|||||||
@@ -167,6 +167,9 @@ The repository keeps the front page concise and places detailed documentation un
|
|||||||
- [Architecture](docs/architecture.md)
|
- [Architecture](docs/architecture.md)
|
||||||
Structural model, data flow, and runtime lookup behavior.
|
Structural model, data flow, and runtime lookup behavior.
|
||||||
|
|
||||||
|
- [Lookup Edge Optimization](docs/lookup-edge-optimization.md)
|
||||||
|
Speed/memory trade-off of dense child edge lookup in compiled tries.
|
||||||
|
|
||||||
- [Reduction Semantics](docs/reduction-semantics.md)
|
- [Reduction Semantics](docs/reduction-semantics.md)
|
||||||
Ranked, unordered, and dominant reduction behavior.
|
Ranked, unordered, and dominant reduction behavior.
|
||||||
|
|
||||||
|
|||||||
161
build.gradle
161
build.gradle
@@ -108,9 +108,19 @@ dependencyCheck {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tasks.withType(Test).configureEach {
|
def cliIncludeTags = project.findProperty('includeTags')?.toString() ?: System.getProperty('includeTags')
|
||||||
useJUnitPlatform()
|
def cliExcludeTags = project.findProperty('excludeTags')?.toString() ?: System.getProperty('excludeTags')
|
||||||
|
|
||||||
|
def splitTagExpression = { String tagsExpr ->
|
||||||
|
if (tagsExpr == null || tagsExpr.isBlank()) {
|
||||||
|
return []
|
||||||
|
}
|
||||||
|
return tagsExpr.split(',')
|
||||||
|
.collect { it.trim() }
|
||||||
|
.findAll { it != null && !it.isBlank() }
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.withType(Test).configureEach {
|
||||||
doFirst {
|
doFirst {
|
||||||
jvmArgs "-javaagent:${configurations.mockitoAgent.singleFile}"
|
jvmArgs "-javaagent:${configurations.mockitoAgent.singleFile}"
|
||||||
}
|
}
|
||||||
@@ -123,14 +133,127 @@ tasks.withType(Test).configureEach {
|
|||||||
minHeapSize = '1g'
|
minHeapSize = '1g'
|
||||||
maxHeapSize = '4g'
|
maxHeapSize = '4g'
|
||||||
|
|
||||||
finalizedBy(tasks.named('jacocoTestReport'))
|
|
||||||
|
|
||||||
reports {
|
reports {
|
||||||
junitXml.required = true
|
junitXml.required = true
|
||||||
html.required = true
|
html.required = true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def configureJUnitPlatformTags = { Test task, String includeTagsExpr, String excludeTagsExpr ->
|
||||||
|
task.useJUnitPlatform {
|
||||||
|
final def includes = splitTagExpression(includeTagsExpr)
|
||||||
|
final def excludes = splitTagExpression(excludeTagsExpr)
|
||||||
|
|
||||||
|
if (!includes.isEmpty()) {
|
||||||
|
includeTags(*includes.toArray(new String[0]))
|
||||||
|
}
|
||||||
|
if (!excludes.isEmpty()) {
|
||||||
|
excludeTags(*excludes.toArray(new String[0]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.named('test', Test) {
|
||||||
|
configureJUnitPlatformTags(it, cliIncludeTags, cliExcludeTags)
|
||||||
|
finalizedBy(tasks.named('jacocoTestReport'))
|
||||||
|
}
|
||||||
|
|
||||||
|
def configureTaggedTestProfile = { String taskName, String includeTagsExpr, String excludeTagsExpr = null,
|
||||||
|
String taskDescription = null, String testNameExcludePatterns = null ->
|
||||||
|
tasks.register(taskName, Test) {
|
||||||
|
group = 'verification'
|
||||||
|
description = taskDescription
|
||||||
|
|
||||||
|
configureJUnitPlatformTags(delegate as Test, includeTagsExpr, excludeTagsExpr)
|
||||||
|
testClassesDirs = sourceSets.test.output.classesDirs
|
||||||
|
classpath = sourceSets.test.runtimeClasspath
|
||||||
|
dependsOn(tasks.named('compileTestJava'))
|
||||||
|
|
||||||
|
doFirst {
|
||||||
|
jvmArgs "-javaagent:${configurations.mockitoAgent.singleFile}"
|
||||||
|
}
|
||||||
|
|
||||||
|
if (testNameExcludePatterns != null && !testNameExcludePatterns.isBlank()) {
|
||||||
|
filter {
|
||||||
|
testNameExcludePatterns.split(',').each { String pattern ->
|
||||||
|
final def trimmedPattern = pattern.trim()
|
||||||
|
if (!trimmedPattern.isEmpty()) {
|
||||||
|
excludeTestsMatching(trimmedPattern)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
minHeapSize = '1g'
|
||||||
|
maxHeapSize = '4g'
|
||||||
|
|
||||||
|
reports {
|
||||||
|
junitXml.required = true
|
||||||
|
html.required = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciSmoke',
|
||||||
|
'unit',
|
||||||
|
'slow',
|
||||||
|
'Fast feedback profile for unit tests with slow tests explicitly excluded.',
|
||||||
|
'org.egothor.stemmer.CompileIntegrationTest*'
|
||||||
|
)
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciCore',
|
||||||
|
'unit,trie,frequency-trie,property',
|
||||||
|
null,
|
||||||
|
'Focused profile for core trie behavior and trie-specific property checks.'
|
||||||
|
)
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciIntegration',
|
||||||
|
'integration',
|
||||||
|
'slow',
|
||||||
|
'Integration pipeline profile (loader/parser/CLI/IO end-to-end flows) excluding slow integration paths.'
|
||||||
|
)
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciSlow',
|
||||||
|
'slow',
|
||||||
|
null,
|
||||||
|
'Targeted profile for all slow tests (large dictionaries, long-running corpus validation, and heavy integration checks).'
|
||||||
|
)
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciCompat',
|
||||||
|
'compat,regression',
|
||||||
|
null,
|
||||||
|
'Compatibility profile guarding persisted artifact and compatibility regressions.'
|
||||||
|
)
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciRelease',
|
||||||
|
null,
|
||||||
|
'slow',
|
||||||
|
'Release-profile validation of all non-slow tests.',
|
||||||
|
'org.egothor.stemmer.CompileIntegrationTest*,org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*'
|
||||||
|
)
|
||||||
|
|
||||||
|
configureTaggedTestProfile(
|
||||||
|
'ciNightly',
|
||||||
|
'fuzz',
|
||||||
|
null,
|
||||||
|
'Nightly robustness profile with fuzz testing emphasis.'
|
||||||
|
)
|
||||||
|
|
||||||
|
tasks.register('ci') {
|
||||||
|
group = 'verification'
|
||||||
|
description = 'Runs the full enterprise CI profile set in sequence.'
|
||||||
|
dependsOn(tasks.named('ciSmoke'))
|
||||||
|
dependsOn(tasks.named('ciCore'))
|
||||||
|
dependsOn(tasks.named('ciIntegration'))
|
||||||
|
dependsOn(tasks.named('ciCompat'))
|
||||||
|
}
|
||||||
|
|
||||||
tasks.withType(Pmd).configureEach {
|
tasks.withType(Pmd).configureEach {
|
||||||
reports {
|
reports {
|
||||||
xml.required = true
|
xml.required = true
|
||||||
@@ -155,6 +278,36 @@ tasks.named('jacocoTestReport', JacocoReport) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def registerJacocoProfileReport = { String reportTaskName, String sourceTaskName ->
|
||||||
|
tasks.register(reportTaskName, JacocoReport) {
|
||||||
|
group = 'verification'
|
||||||
|
description = "Generates Jacoco report for ${sourceTaskName} execution."
|
||||||
|
|
||||||
|
dependsOn(tasks.named(sourceTaskName))
|
||||||
|
|
||||||
|
classDirectories.setFrom(
|
||||||
|
files(sourceSets.main.output).asFileTree.matching {
|
||||||
|
exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
|
||||||
|
exclude 'org/egothor/stemmer/DiacriticStripper*'
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
executionData.setFrom(
|
||||||
|
fileTree(layout.buildDirectory.dir('jacoco')) {
|
||||||
|
include "${sourceTaskName}.exec"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
reports {
|
||||||
|
xml.required = true
|
||||||
|
csv.required = false
|
||||||
|
html.required = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
registerJacocoProfileReport('jacocoCiReleaseReport', 'ciRelease')
|
||||||
|
|
||||||
tasks.named('check') {
|
tasks.named('check') {
|
||||||
dependsOn(tasks.named('jacocoTestReport'))
|
dependsOn(tasks.named('jacocoTestReport'))
|
||||||
// no-default, only on-demand: dependsOn(tasks.named('dependencyCheckAnalyze'))
|
// no-default, only on-demand: dependsOn(tasks.named('dependencyCheckAnalyze'))
|
||||||
|
|||||||
193
docs/lookup-edge-optimization.md
Normal file
193
docs/lookup-edge-optimization.md
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
# Lookup Edge Optimization
|
||||||
|
|
||||||
|
Compiled trie nodes (`CompiledNode`) use three lookup strategies when resolving child edges:
|
||||||
|
|
||||||
|
1. dense array direct lookup,
|
||||||
|
2. linear scan for very small child counts,
|
||||||
|
3. binary search over sorted edge labels.
|
||||||
|
|
||||||
|
This page explains the dense path, what `maxExpandedIndex` controls, and how to tune it.
|
||||||
|
|
||||||
|
## Runtime model of one node
|
||||||
|
|
||||||
|
For a node with sorted edge labels `char[] edges`, the implementation can materialize an
|
||||||
|
index-aligned dense table when labels occupy a small compact code-point interval:
|
||||||
|
|
||||||
|
```text
|
||||||
|
span = maxEdge - minEdge
|
||||||
|
use dense table iff (span <= maxExpandedIndex) and (maxExpandedIndex > 0)
|
||||||
|
```
|
||||||
|
|
||||||
|
When dense lookup is used, lookup is constant-time indexing:
|
||||||
|
|
||||||
|
```text
|
||||||
|
denseIndex = requestedEdge - minEdge
|
||||||
|
return denseChildren[denseIndex] // or null if outside interval
|
||||||
|
```
|
||||||
|
|
||||||
|
When dense lookup is not active (interval is too wide or the configured
|
||||||
|
`maxExpandedIndex` is `0`), `CompiledNode` still chooses between two fallback
|
||||||
|
strategies:
|
||||||
|
|
||||||
|
- **linear scan** for very small child counts (`4` or fewer children),
|
||||||
|
- **binary search** for larger child counts.
|
||||||
|
|
||||||
|
This means the fallback method is selected by child count, not by “distance” alone.
|
||||||
|
`linear scan` is therefore used when there are only a few edges even if those edges are
|
||||||
|
spread across very distant code points.
|
||||||
|
|
||||||
|
### Example: few edges, wide Unicode span
|
||||||
|
|
||||||
|
```text
|
||||||
|
edges = ['a', '中', '你']
|
||||||
|
edge count = 3
|
||||||
|
minEdge = 'a' (U+0061)
|
||||||
|
maxEdge = '你' (U+4F60)
|
||||||
|
span = 20223
|
||||||
|
```
|
||||||
|
|
||||||
|
- If `maxExpandedIndex = 512`, dense indexing is not used because `span > maxExpandedIndex`.
|
||||||
|
- Because `edge count = 3` (<= 4), lookup falls back to a tiny linear scan of the
|
||||||
|
three labels.
|
||||||
|
- This is exactly the case where the small-child-count (linear scan) cutoff helps even though the interval is far too wide for a dense table.
|
||||||
|
|
||||||
|
This is useful for non-Latin scripts as well: what matters is interval width in Unicode
|
||||||
|
code points, not script name. A compact Arabic-range block can still benefit from dense
|
||||||
|
lookups when keys stay in a tight code-point interval.
|
||||||
|
|
||||||
|
## Why this is configurable
|
||||||
|
|
||||||
|
`maxExpandedIndex` is only a performance/memory trade-off:
|
||||||
|
|
||||||
|
- higher value:
|
||||||
|
- more compact intervals qualify for dense tables,
|
||||||
|
- more constant-time child lookup,
|
||||||
|
- more memory for dense tables in qualifying nodes.
|
||||||
|
- lower value (or `0`):
|
||||||
|
- less dense-table allocation,
|
||||||
|
- fewer nodes take the constant-time path,
|
||||||
|
- lower materialization memory.
|
||||||
|
|
||||||
|
The value never changes lookup semantics. It only changes the in-memory structure shape.
|
||||||
|
|
||||||
|
## Persistence and loading model
|
||||||
|
|
||||||
|
This threshold is **not** stored in `TrieMetadata`.
|
||||||
|
|
||||||
|
- The binary format stores only trie payload and semantic metadata (`reduction`, `traversal`,
|
||||||
|
case/diacritic settings, and stream version).
|
||||||
|
- `maxExpandedIndex` is chosen when materializing nodes in memory.
|
||||||
|
- You can therefore keep one persisted artifact and load it with different in-memory
|
||||||
|
trade-offs depending on deployment constraints.
|
||||||
|
|
||||||
|
## Default
|
||||||
|
|
||||||
|
- `FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX == 512`
|
||||||
|
- `CompiledNode.DEFAULT_MAX_EXPANDED_INDEX == 512`
|
||||||
|
|
||||||
|
These are practical defaults for mixed-language text and Latin-like scripts where edge labels
|
||||||
|
often cluster.
|
||||||
|
|
||||||
|
## Tune during build (writable phase)
|
||||||
|
|
||||||
|
Use the full `FrequencyTrie.Builder` constructor when you are compiling from source data.
|
||||||
|
The builder threshold is applied while freezing reduced nodes into the immutable form.
|
||||||
|
|
||||||
|
```java
|
||||||
|
import org.egothor.stemmer.CaseProcessingMode;
|
||||||
|
import org.egothor.stemmer.DiacriticProcessingMode;
|
||||||
|
import org.egothor.stemmer.FrequencyTrie;
|
||||||
|
import org.egothor.stemmer.ReductionMode;
|
||||||
|
import org.egothor.stemmer.ReductionSettings;
|
||||||
|
import org.egothor.stemmer.WordTraversalDirection;
|
||||||
|
|
||||||
|
final ReductionSettings settings = ReductionSettings.withDefaults(
|
||||||
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||||
|
|
||||||
|
final FrequencyTrie.Builder<String> fastBuilder =
|
||||||
|
new FrequencyTrie.Builder<>(String[]::new,
|
||||||
|
settings,
|
||||||
|
WordTraversalDirection.BACKWARD,
|
||||||
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||||
|
DiacriticProcessingMode.AS_IS,
|
||||||
|
1024); // prefer lookup speed
|
||||||
|
|
||||||
|
// ... put(...) ...
|
||||||
|
final FrequencyTrie<String> trie = fastBuilder.build();
|
||||||
|
```
|
||||||
|
|
||||||
|
Use a lower value such as `256`, or `0` to disable dense tables entirely, to reduce memory when building large tries.
|
||||||
|
|
||||||
|
```java
|
||||||
|
final FrequencyTrie.Builder<String> compactBuilder =
|
||||||
|
new FrequencyTrie.Builder<>(String[]::new,
|
||||||
|
settings,
|
||||||
|
WordTraversalDirection.BACKWARD,
|
||||||
|
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||||
|
DiacriticProcessingMode.AS_IS,
|
||||||
|
256); // lower memory profile
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tune when loading a binary artifact (runtime phase)
|
||||||
|
|
||||||
|
At artifact load time, you can tune the same trade-off independently of persisted metadata.
|
||||||
|
|
||||||
|
```java
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||||
|
|
||||||
|
var defaultLookup = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"));
|
||||||
|
|
||||||
|
var fastLookup = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"), 1024);
|
||||||
|
|
||||||
|
var compactLookup = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"), 0);
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also set the threshold directly with `FrequencyTrie.readFrom(...)` when reading streams:
|
||||||
|
|
||||||
|
```java
|
||||||
|
import java.io.DataInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
|
||||||
|
import org.egothor.stemmer.FrequencyTrie;
|
||||||
|
|
||||||
|
public final class StreamLoadExample {
|
||||||
|
|
||||||
|
private StreamLoadExample() {
|
||||||
|
throw new AssertionError("No instances.");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(final String[] arguments) throws IOException {
|
||||||
|
try (InputStream fileInput = Files.newInputStream(Path.of("stemmers", "english.radixor.gz"));
|
||||||
|
GZIPInputStream gzip = new GZIPInputStream(fileInput);
|
||||||
|
DataInputStream dataInput = new DataInputStream(gzip)) {
|
||||||
|
final FrequencyTrie<String> compactOnLoad = FrequencyTrie.readFrom(
|
||||||
|
dataInput,
|
||||||
|
String[]::new,
|
||||||
|
input -> input.readUTF(),
|
||||||
|
256);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: the string codec is intentionally inline in this snippet to keep it self-contained.
|
||||||
|
|
||||||
|
## Practical guidance
|
||||||
|
|
||||||
|
- Start with default (`512`) in production and profile before changing it.
|
||||||
|
- Use `0` when memory is the priority and query throughput is not the bottleneck.
|
||||||
|
- Use values around `1024` for workloads dominated by compact alphabets and very hot lookups.
|
||||||
|
|
||||||
|
Trade-off expectation:
|
||||||
|
|
||||||
|
- increasing `maxExpandedIndex` improves lookup speed when edges tend to occupy short spans,
|
||||||
|
- decreasing it reduces per-node auxiliary memory in dense-span nodes.
|
||||||
@@ -87,6 +87,43 @@ public final class LoadBinaryExample {
|
|||||||
|
|
||||||
The binary format is the native `FrequencyTrie` serialization wrapped in GZip compression. It includes persisted `TrieMetadata`, so lookup after loading uses the traversal, case-processing, diacritic-processing, and reduction settings captured when the trie was compiled.
|
The binary format is the native `FrequencyTrie` serialization wrapped in GZip compression. It includes persisted `TrieMetadata`, so lookup after loading uses the traversal, case-processing, diacritic-processing, and reduction settings captured when the trie was compiled.
|
||||||
|
|
||||||
|
## Tune child lookup density when loading binaries
|
||||||
|
|
||||||
|
To optimize hot-path latency, you can tune direct child indexing by passing `maxExpandedIndex`
|
||||||
|
at load time. This does not change persisted metadata, only the materialized in-memory form.
|
||||||
|
|
||||||
|
```java
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import org.egothor.stemmer.FrequencyTrie;
|
||||||
|
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||||
|
|
||||||
|
public final class LoadBinaryWithDenseLookupExample {
|
||||||
|
|
||||||
|
private LoadBinaryWithDenseLookupExample() {
|
||||||
|
throw new AssertionError("No instances.");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(final String[] arguments) throws IOException {
|
||||||
|
final FrequencyTrie<String> balanced = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"));
|
||||||
|
|
||||||
|
final FrequencyTrie<String> fast = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"),
|
||||||
|
1024);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> compact = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"),
|
||||||
|
0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Negative values still use `FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX`.
|
||||||
|
|
||||||
|
[Lookup Edge Optimization](lookup-edge-optimization.md) describes the trade-off in detail and also provides examples for build-time tuning.
|
||||||
|
|
||||||
## Build directly with a mutable builder
|
## Build directly with a mutable builder
|
||||||
|
|
||||||
A `FrequencyTrie.Builder<V>` accepts repeated `put(key, value)` calls and compiles the final read-only trie through `build()`. Compilation performs bottom-up reduction and produces the compact immutable runtime representation.
|
A `FrequencyTrie.Builder<V>` accepts repeated `put(key, value)` calls and compiles the final read-only trie through `build()`. Compilation performs bottom-up reduction and produces the compact immutable runtime representation.
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ This is why Radixor can generalize beyond explicitly listed forms and why compil
|
|||||||
The programmatic API is easier to understand when split by developer task:
|
The programmatic API is easier to understand when split by developer task:
|
||||||
|
|
||||||
- [Loading and Building Stemmers](programmatic-loading-and-building.md) explains how to acquire a compiled stemmer from bundled resources, textual dictionaries, binary artifacts, or direct builder usage.
|
- [Loading and Building Stemmers](programmatic-loading-and-building.md) explains how to acquire a compiled stemmer from bundled resources, textual dictionaries, binary artifacts, or direct builder usage.
|
||||||
|
- [Lookup Edge Optimization](lookup-edge-optimization.md) explains dense child lookup tuning and the speed/memory trade-off when materializing compiled tries.
|
||||||
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md) explains `get(...)`, `getAll(...)`, `getEntries(...)`, patch application, and the practical meaning of reduction modes.
|
- [Querying and Ambiguity Handling](programmatic-querying-and-ambiguity.md) explains `get(...)`, `getAll(...)`, `getEntries(...)`, patch application, and the practical meaning of reduction modes.
|
||||||
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md) explains how to reopen compiled tries, add new lexical data, rebuild them, and store them as binary artifacts.
|
- [Extending and Persisting Compiled Tries](programmatic-extending-and-persistence.md) explains how to reopen compiled tries, add new lexical data, rebuild them, and store them as binary artifacts.
|
||||||
|
|
||||||
|
|||||||
@@ -58,6 +58,27 @@ A deterministic system is easier to test, easier to reason about, and safer to i
|
|||||||
|
|
||||||
The project is intended to maintain very high confidence in both core correctness and behavioral stability.
|
The project is intended to maintain very high confidence in both core correctness and behavioral stability.
|
||||||
|
|
||||||
|
The recommended execution strategy is defined by the tagged test profiles in [Test taxonomy and execution filtering](test-taxonomy-and-filtering.md). In practice, teams can execute profile tasks directly:
|
||||||
|
|
||||||
|
- `./gradlew ciSmoke`: fast local/PR safety checks (`unit`, excluding `slow`; additionally excludes
|
||||||
|
`CompileIntegrationTest` as a defensive safeguard).
|
||||||
|
- `./gradlew ciSlow`: enterprise heavy gate for all tests marked with `slow` (typically
|
||||||
|
production dictionary and large corpus verification). This should be used for scheduled/manual
|
||||||
|
hardening gates, not in the standard release build.
|
||||||
|
- `./gradlew ciCore`: behavioral coverage of trie and frequency-trie paths (`unit` + `property` where applicable)
|
||||||
|
- `./gradlew ciIntegration`: pipeline and CLI integration path checks
|
||||||
|
- `./gradlew ciCompat`: compatibility and regression verification for persisted artifacts
|
||||||
|
- `./gradlew ciRelease`: full non-slow suite for release-confidence runs (all test tags except `slow`,
|
||||||
|
plus explicit name-based exclusion of `CompileIntegrationTest*` and
|
||||||
|
`StemmerPatchTrieLoaderTest$BundledDictionaryTests*` as additional guardrails)
|
||||||
|
- `./gradlew ciNightly`: extended fuzz profile for robustness hardening
|
||||||
|
- `./gradlew ci`: umbrella profile depending on smoke/core/integration/compat
|
||||||
|
|
||||||
|
## Test taxonomy and execution filtering
|
||||||
|
|
||||||
|
The full tag taxonomy and executable filter examples are documented in
|
||||||
|
[Test taxonomy and execution filtering](test-taxonomy-and-filtering.md).
|
||||||
|
|
||||||
### Structural coverage
|
### Structural coverage
|
||||||
|
|
||||||
High code coverage is treated as a useful signal, but not as a sufficient goal on its own. Coverage is valuable only when the covered scenarios actually pressure the implementation in meaningful ways.
|
High code coverage is treated as a useful signal, but not as a sufficient goal on its own. Coverage is valuable only when the covered scenarios actually pressure the implementation in meaningful ways.
|
||||||
|
|||||||
@@ -67,6 +67,36 @@ public final class LoadBinaryStemmerExample {
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You can tune in-memory child lookup density at load time without changing the artifact:
|
||||||
|
|
||||||
|
```java
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import org.egothor.stemmer.FrequencyTrie;
|
||||||
|
import org.egothor.stemmer.StemmerPatchTrieLoader;
|
||||||
|
|
||||||
|
public final class LoadBinaryStemmerExampleTuned {
|
||||||
|
|
||||||
|
private LoadBinaryStemmerExampleTuned() {
|
||||||
|
throw new AssertionError("No instances.");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(final String[] arguments) throws IOException {
|
||||||
|
final FrequencyTrie<String> fast = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"),
|
||||||
|
1024);
|
||||||
|
final FrequencyTrie<String> compact = StemmerPatchTrieLoader.loadBinary(
|
||||||
|
Path.of("stemmers", "english.radixor.gz"),
|
||||||
|
128);
|
||||||
|
|
||||||
|
System.out.println("fast=" + fast.size() + ", compact=" + compact.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
For the trade-off details, see [Lookup Edge Optimization](lookup-edge-optimization.md).
|
||||||
|
|
||||||
### Build or extend a stemmer from dictionary data
|
### Build or extend a stemmer from dictionary data
|
||||||
|
|
||||||
Radixor can also build a compiled trie from a custom dictionary. Dictionary lines consist of a canonical stem followed by zero or more variants. The input may be plain UTF-8 text or GZip-compressed UTF-8 text when loaded from a filesystem path. The parser applies `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), ignores leading and trailing whitespace around columns, supports line remarks introduced by `#` or `//`, and skips dictionary items that contain embedded whitespace.
|
Radixor can also build a compiled trie from a custom dictionary. Dictionary lines consist of a canonical stem followed by zero or more variants. The input may be plain UTF-8 text or GZip-compressed UTF-8 text when loaded from a filesystem path. The parser applies `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), ignores leading and trailing whitespace around columns, supports line remarks introduced by `#` or `//`, and skips dictionary items that contain embedded whitespace.
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ These reports are primarily useful when reviewing the published API surface and
|
|||||||
|
|
||||||
These reports describe the outcome of core verification and static-analysis stages for the latest published build:
|
These reports describe the outcome of core verification and static-analysis stages for the latest published build:
|
||||||
|
|
||||||
- [Unit test report](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
- [Release verification test report (ciRelease)](https://leogalambos.github.io/Radixor/builds/latest/test/)
|
||||||
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
|
- [PMD report](https://leogalambos.github.io/Radixor/builds/latest/pmd/main.html)
|
||||||
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
- [JaCoCo coverage report](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
||||||
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
- [PIT mutation testing report](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
||||||
|
|||||||
216
docs/test-taxonomy-and-filtering.md
Normal file
216
docs/test-taxonomy-and-filtering.md
Normal file
@@ -0,0 +1,216 @@
|
|||||||
|
# Test Tag Taxonomy and Execution Guide
|
||||||
|
|
||||||
|
Radixor uses JUnit tags as an explicit execution policy for its test suite.
|
||||||
|
|
||||||
|
The project uses three orthogonal axes:
|
||||||
|
|
||||||
|
1. **Scope** (how the test is executed in the pipeline)
|
||||||
|
2. **Domain** (where in the system it belongs)
|
||||||
|
3. **Intent** (what behavior it verifies)
|
||||||
|
|
||||||
|
## Canonical scope tags
|
||||||
|
|
||||||
|
| Tag | Description | Typical usage |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `unit` | Fast, deterministic tests that exercise a specific class or behavior without external processes. | Default developer feedback; should stay near-zero flakiness and low run time. |
|
||||||
|
| `integration` | Tests that span multiple components or end-to-end flows of the public pipeline. | Parser/loader/CLI/IO integration checks and multi-step compile-then-load validations. |
|
||||||
|
| `property` | Property-based tests with generator-driven coverage for invariants. | Semantics-preserving laws and edge-case exploration beyond curated fixtures. |
|
||||||
|
| `fuzz` | Randomized stress checks with bounded runtime. | Heavier probabilistic verification of robustness and reduction invariants. |
|
||||||
|
| `compat` | Backward/forward compatibility and reproducibility checks for persisted artifacts. | Artifact fingerprints, deterministic rebuild, and regression fixtures. |
|
||||||
|
| `slow` | Long-running or expensive tests that should not execute in every fast gate. | Heavy fuzz/property budgets or high-duration integration checks. |
|
||||||
|
|
||||||
|
## Canonical domain tags
|
||||||
|
|
||||||
|
| Tag | Description | Typical usage |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `core` | Core algorithm and foundational platform behavior. | Traversal direction, base data structures, low-level helpers. |
|
||||||
|
| `trie` | All mutable/compiled trie behaviors and traversal internals. | Lookup path selection, node shape, child representation, subtree behavior. |
|
||||||
|
| `frequency-trie` | Algorithms and corner cases specific to frequency-aware trie logic. | Ranking, weighted reductions, persistence of weighted nodes. |
|
||||||
|
| `stemmer` | End-user stemming pipeline semantics. | Parse-encode-apply flows and output invariants. |
|
||||||
|
| `patch` | Patch encoding, decoding, and application semantics. | `PatchCommandEncoder` behavior and related compatibility contracts. |
|
||||||
|
| `io` | Input/output and resource loading boundaries. | Filesystem readers, streams, and stream lifecycle handling. |
|
||||||
|
| `serialization` | Binary persistence contract of compiled artifacts. | Versioned format reads/writes and checksum/consistency checks. |
|
||||||
|
| `parser` | Dictionary and metadata parsing concerns. | Dictionary input parsing and malformed-source rejection. |
|
||||||
|
| `cli` | Command-line entrypoint and command orchestration behavior. | Compile CLI integration and CLI argument validation. |
|
||||||
|
| `metadata` | Trie metadata semantics, compatibility fields, and schema expectations. | Version flags, structural properties, and metadata round-trips. |
|
||||||
|
| `compile` | Compile-time pipeline and build-oriented behavior. | Building, reduction-mode behavior, and compiled artifact generation. |
|
||||||
|
| `diacritic` | Unicode diacritic normalization and stripping behavior. | Accent-removal correctness and locale-safe normalization checks. |
|
||||||
|
|
||||||
|
## Canonical intent tags
|
||||||
|
|
||||||
|
| Tag | Description | Typical usage |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `construction` | Tests around construction and assembly of runtime structures. | Builders, loaders, and compile-time object construction contracts. |
|
||||||
|
| `lookup` | Read behavior and retrieval semantics. | `get()`, `getAll()`, traversal and missing-key behavior. |
|
||||||
|
| `persistence` | Storage lifecycle semantics. | Serialization/deserialization and round-trip correctness. |
|
||||||
|
| `reduction` | Reduction algorithm correctness and corner cases. | Dominance threshold, subtree deduplication, rank-preservation invariants. |
|
||||||
|
| `encoding` | Encoding transformation direction. | `PatchCommandEncoder.encode` and serialized command form generation. |
|
||||||
|
| `decoding` | Decoding/interpretation of persisted or runtime commands. | Optional consumers that parse and apply encoded command payloads. |
|
||||||
|
| `apply` | Patch application and transformation behavior. | Verifies that applied patches produce expected derived forms. |
|
||||||
|
| `normalization` | Canonicalization and cleanup behavior. | String normalization around case/shape and mirrored input paths. |
|
||||||
|
| `validation` | Input rejection and defensive checks. | Null/empty/invalid contracts and explicit failure conditions. |
|
||||||
|
| `regression` | Guard tests for behavior changes over time. | Known historical bugs and behavioral drift prevention. |
|
||||||
|
| `determinism` | Repeatable results under fixed input and settings. | Compile determinism, stable ordering, and artifact reproducibility. |
|
||||||
|
| `error-handling` | Exception surface and robustness expectations. | Recovery/failure modes and diagnostics quality. |
|
||||||
|
|
||||||
|
## Class-level rules
|
||||||
|
|
||||||
|
1. Every test class has **exactly one** scope tag.
|
||||||
|
2. Every test class has at least one domain tag.
|
||||||
|
3. Additional tags describe intent and may be used on classes or nested tests.
|
||||||
|
4. For each test class, intent tags should reflect the primary behavior under test, not historical naming conventions.
|
||||||
|
|
||||||
|
## Governance and execution policy
|
||||||
|
|
||||||
|
The following rules are used to keep the suite auditable and stable:
|
||||||
|
|
||||||
|
| Rule | Required state | Why |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| Scope discipline | Exactly one scope tag per class. | Prevents accidental promotion of integration-only behavior into fast unit runs. |
|
||||||
|
| Coverage breadth | At least one domain tag per class. | Ensures tests can be grouped by subsystem for targeted review. |
|
||||||
|
| Intent specificity | Use at least one intent tag when behavior is non-trivial. | Makes failure triage faster and profile composition explicit. |
|
||||||
|
| Runtime policy | Never run `slow` tests in the default `unit` profile unless explicitly required. | Keeps PR feedback fast while deep checks still run in their dedicated profiles. |
|
||||||
|
| Change risk | Any persistence or compatibility-affecting change must include `compat` in validation. | Protects long-lived binary artifact contracts. |
|
||||||
|
| Mutation resistance | `fuzz`/`property` sets should be gated to dedicated profiles. | Limits flakiness exposure and controls CI resource cost. |
|
||||||
|
|
||||||
|
## Suggested CI profiles
|
||||||
|
|
||||||
|
These are recommended launch profiles for local and CI usage and are also exposed as Gradle tasks:
|
||||||
|
|
||||||
|
- **Profile: `ci-smoke` (fast feedback):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=unit -DexcludeTags=slow
|
||||||
|
./gradlew ciSmoke
|
||||||
|
```
|
||||||
|
|
||||||
|
`ciSmoke` also excludes `org.egothor.stemmer.CompileIntegrationTest*` at test-name filter level as a
|
||||||
|
defensive fallback in case of future tag drift.
|
||||||
|
`ciRelease` also excludes
|
||||||
|
`org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*` at filter level.
|
||||||
|
|
||||||
|
- **Profile: `ci-core` (core behavioral coverage):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=unit,trie,frequency-trie,property
|
||||||
|
./gradlew ciCore
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Profile: `ci-integration` (pipeline correctness):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=integration
|
||||||
|
./gradlew ciIntegration
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Profile: `ci-slow` (explicit heavy validation):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew ciSlow
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Profile: `ci-compat` (artifact stability):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=compat,regression
|
||||||
|
./gradlew ciCompat
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Profile: `ci-release` (strong confidence before release):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DexcludeTags=slow
|
||||||
|
./gradlew ciRelease
|
||||||
|
```
|
||||||
|
`ciRelease` is non-slow by policy and uses the same defensive name-based exclusion for
|
||||||
|
`org.egothor.stemmer.CompileIntegrationTest*` and
|
||||||
|
`org.egothor.stemmer.StemmerPatchTrieLoaderTest$BundledDictionaryTests*` in addition to tag filtering.
|
||||||
|
|
||||||
|
- **Profile: `ci-nightly` (extended hardening):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=fuzz
|
||||||
|
./gradlew ciNightly
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Profile: `ci` (enterprise umbrella):**
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew ci
|
||||||
|
```
|
||||||
|
|
||||||
|
`ci` and `ciRelease` intentionally do **not** include `slow` paths. Run `ciSlow` explicitly for production-dictionary stress and long-running corpus checks.
|
||||||
|
|
||||||
|
## Practical examples
|
||||||
|
|
||||||
|
All examples use Gradle with JUnit Platform integration:
|
||||||
|
|
||||||
|
- Only unit tests:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=unit
|
||||||
|
```
|
||||||
|
|
||||||
|
- Integration tests only:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=integration
|
||||||
|
```
|
||||||
|
|
||||||
|
- Only trie subsystem tests:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=trie
|
||||||
|
```
|
||||||
|
|
||||||
|
- Deterministic fuzz checks:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=fuzz
|
||||||
|
```
|
||||||
|
|
||||||
|
- Property tests:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=property
|
||||||
|
```
|
||||||
|
|
||||||
|
- Stemmer + patch command behavior:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=stemmer,patch
|
||||||
|
```
|
||||||
|
|
||||||
|
- Compatibility artifacts and regression checks:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=compat
|
||||||
|
```
|
||||||
|
|
||||||
|
- Keep regression suite and remove long-running cases:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=regression -DexcludeTags=slow
|
||||||
|
```
|
||||||
|
|
||||||
|
- Trie + patch behavior:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=trie,patch
|
||||||
|
```
|
||||||
|
|
||||||
|
- Deterministic compatibility and persistence checks:
|
||||||
|
|
||||||
|
```
|
||||||
|
./gradlew test -DincludeTags=compat,determinism,serialization
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- `-DincludeTags` and `-DexcludeTags` are interpreted by Gradle task filtering and forwarded into
|
||||||
|
JUnit tag filtering.
|
||||||
|
- Class-name filtering is also available via Gradle test selectors where needed
|
||||||
|
(for example, `--tests *CompileTest`), but tag filtering remains the default
|
||||||
|
execution strategy.
|
||||||
|
- `-DincludeTags` supports comma-separated literal tags. When you need a single exact tag with special
|
||||||
|
characters, quote the argument for the shell.
|
||||||
@@ -84,7 +84,7 @@ publishing {
|
|||||||
}
|
}
|
||||||
|
|
||||||
signing {
|
signing {
|
||||||
required { !version.toString().endsWith('-SNAPSHOT') }
|
required = !version.toString().endsWith('-SNAPSHOT')
|
||||||
if (signingKey != null && !signingKey.isBlank()) {
|
if (signingKey != null && !signingKey.isBlank()) {
|
||||||
useInMemoryPgpKeys(signingKey, signingPassword)
|
useInMemoryPgpKeys(signingKey, signingPassword)
|
||||||
sign publishing.publications.mavenJava
|
sign publishing.publications.mavenJava
|
||||||
|
|||||||
@@ -54,6 +54,7 @@ nav:
|
|||||||
- Overview: architecture-and-reduction.md
|
- Overview: architecture-and-reduction.md
|
||||||
- Architecture: architecture.md
|
- Architecture: architecture.md
|
||||||
- Reduction Semantics: reduction-semantics.md
|
- Reduction Semantics: reduction-semantics.md
|
||||||
|
- Lookup Edge Optimization: lookup-edge-optimization.md
|
||||||
- Compatibility and Guarantees: compatibility-and-guarantees.md
|
- Compatibility and Guarantees: compatibility-and-guarantees.md
|
||||||
|
|
||||||
- Dictionaries:
|
- Dictionaries:
|
||||||
@@ -63,3 +64,4 @@ nav:
|
|||||||
- Quality and Operations: quality-and-operations.md
|
- Quality and Operations: quality-and-operations.md
|
||||||
- Benchmarking: benchmarking.md
|
- Benchmarking: benchmarking.md
|
||||||
- Reports: reports.md
|
- Reports: reports.md
|
||||||
|
- Test taxonomy and execution filtering: test-taxonomy-and-filtering.md
|
||||||
|
|||||||
@@ -51,7 +51,6 @@ import java.util.logging.Logger;
|
|||||||
import org.egothor.stemmer.trie.CompiledNode;
|
import org.egothor.stemmer.trie.CompiledNode;
|
||||||
import org.egothor.stemmer.trie.LocalValueSummary;
|
import org.egothor.stemmer.trie.LocalValueSummary;
|
||||||
import org.egothor.stemmer.trie.MutableNode;
|
import org.egothor.stemmer.trie.MutableNode;
|
||||||
import org.egothor.stemmer.trie.NodeData;
|
|
||||||
import org.egothor.stemmer.trie.ReducedNode;
|
import org.egothor.stemmer.trie.ReducedNode;
|
||||||
import org.egothor.stemmer.trie.ReductionContext;
|
import org.egothor.stemmer.trie.ReductionContext;
|
||||||
import org.egothor.stemmer.trie.ReductionSignature;
|
import org.egothor.stemmer.trie.ReductionSignature;
|
||||||
@@ -87,7 +86,6 @@ import org.egothor.stemmer.trie.ReductionSignature;
|
|||||||
*
|
*
|
||||||
* @param <V> value type
|
* @param <V> value type
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("PMD.CyclomaticComplexity")
|
|
||||||
public final class FrequencyTrie<V> {
|
public final class FrequencyTrie<V> {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -130,11 +128,54 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
private static final int STREAM_MAGIC = 0x45475452;
|
private static final int STREAM_MAGIC = 0x45475452;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Minimum supported binary stream version, used as the lower bound of the version range check.
|
||||||
|
*/
|
||||||
|
private static final int MIN_STREAM_VERSION = 1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of stored values for which {@link #getEntries(String)} can return an
|
||||||
|
* empty result.
|
||||||
|
*/
|
||||||
|
private static final int NO_VALUE_COUNT = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of stored values for which {@link #getEntries(String)} can use a
|
||||||
|
* one-item immutable list special case.
|
||||||
|
*/
|
||||||
|
private static final int SINGLE_VALUE_COUNT = 1;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Binary format version.
|
* Binary format version.
|
||||||
*/
|
*/
|
||||||
private static final int STREAM_VERSION = 5;
|
private static final int STREAM_VERSION = 5;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Version where traversal-direction ordinal is persisted.
|
||||||
|
*/
|
||||||
|
private static final int TRAVERSAL_VERSION = 2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Version where compact reduction metadata is persisted.
|
||||||
|
*/
|
||||||
|
private static final int REDUCTION_VERSION = 3;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Version where case-processing mode ordinal is persisted.
|
||||||
|
*/
|
||||||
|
private static final int CASE_VERSION = 4;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default dense child lookup span in code points used when materializing
|
||||||
|
* compiled nodes without an explicit override.
|
||||||
|
* <p>
|
||||||
|
* Increasing this value increases the chance of direct array indexing for
|
||||||
|
* child lookup at runtime at the cost of per-node dense table memory for
|
||||||
|
* compact character spans.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public static final int DEFAULT_MAX_EXPANDED_INDEX = 512;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the current persisted binary stream format version.
|
* Returns the current persisted binary stream format version.
|
||||||
*
|
*
|
||||||
@@ -259,7 +300,6 @@ public final class FrequencyTrie<V> {
|
|||||||
* if the key does not exist or no value is stored at the addressed node
|
* if the key does not exist or no value is stored at the addressed node
|
||||||
* @throws NullPointerException if {@code key} is {@code null}
|
* @throws NullPointerException if {@code key} is {@code null}
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
|
|
||||||
public List<ValueCount<V>> getEntries(final String key) {
|
public List<ValueCount<V>> getEntries(final String key) {
|
||||||
Objects.requireNonNull(key, "key");
|
Objects.requireNonNull(key, "key");
|
||||||
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
||||||
@@ -269,11 +309,11 @@ public final class FrequencyTrie<V> {
|
|||||||
|
|
||||||
final V[] orderedValues = node.orderedValues();
|
final V[] orderedValues = node.orderedValues();
|
||||||
final int valueCount = orderedValues.length;
|
final int valueCount = orderedValues.length;
|
||||||
if (valueCount == 0) {
|
if (valueCount == NO_VALUE_COUNT) {
|
||||||
return List.of();
|
return List.of();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (valueCount == 1) {
|
if (valueCount == SINGLE_VALUE_COUNT) {
|
||||||
return List.of(new ValueCount<>(orderedValues[0], node.orderedCounts()[0]));
|
return List.of(new ValueCount<>(orderedValues[0], node.orderedCounts()[0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -383,47 +423,31 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
||||||
final ValueStreamCodec<V> valueCodec) throws IOException {
|
final ValueStreamCodec<V> valueCodec) throws IOException {
|
||||||
Objects.requireNonNull(inputStream, "inputStream");
|
return readFrom(inputStream, arrayFactory, valueCodec, -1);
|
||||||
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
}
|
||||||
Objects.requireNonNull(valueCodec, "valueCodec");
|
|
||||||
|
|
||||||
final DataInputStream dataInput; // NOPMD
|
/**
|
||||||
if (inputStream instanceof DataInputStream) {
|
* Reads a compiled trie from the supplied input stream, optionally overriding
|
||||||
dataInput = (DataInputStream) inputStream;
|
* dense child-index span configuration.
|
||||||
} else {
|
* <p>
|
||||||
dataInput = new DataInputStream(inputStream);
|
* This setting is applied only while materializing the in-memory compiled
|
||||||
}
|
* representation during load. It is not serialized in {@link TrieMetadata},
|
||||||
|
* so each load can independently choose its own runtime lookup trade-off.
|
||||||
final int magic = dataInput.readInt();
|
* </p>
|
||||||
if (magic != STREAM_MAGIC) {
|
*
|
||||||
throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic));
|
* @param inputStream source input stream
|
||||||
}
|
* @param arrayFactory array factory used to create typed arrays
|
||||||
|
* @param valueCodec codec used to read values
|
||||||
final int version = dataInput.readInt();
|
* @param maxExpandedIndex dense lookup span override; zero disables dense lookup,
|
||||||
if (version != 1 && version != 3 && version != 4 && version != STREAM_VERSION) {
|
* {@code -1} selects {@link #DEFAULT_MAX_EXPANDED_INDEX}; values below {@code -1} are rejected
|
||||||
throw new IOException("Unsupported trie stream version: " + version);
|
* @param <V> value type
|
||||||
}
|
* @return deserialized compiled trie
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
* @throws IllegalArgumentException if {@code maxExpandedIndex} is less than {@code -1}
|
||||||
final int nodeCount = dataInput.readInt();
|
* @throws IOException if reading fails or the binary format is invalid
|
||||||
if (nodeCount < 0) {
|
*/
|
||||||
throw new IOException("Negative node count: " + nodeCount);
|
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
||||||
}
|
final ValueStreamCodec<V> valueCodec, final int maxExpandedIndex) throws IOException {
|
||||||
|
return CompiledTrieReader.read(inputStream, arrayFactory, valueCodec, maxExpandedIndex);
|
||||||
final int rootNodeId = dataInput.readInt();
|
|
||||||
if (rootNodeId < 0 || rootNodeId >= nodeCount) {
|
|
||||||
throw new IOException("Invalid root node id: " + rootNodeId);
|
|
||||||
}
|
|
||||||
|
|
||||||
final TrieMetadata metadata = readMetadata(dataInput, version);
|
|
||||||
|
|
||||||
final CompiledNode<V>[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount);
|
|
||||||
final CompiledNode<V> rootNode = nodes[rootNodeId];
|
|
||||||
|
|
||||||
if (LOGGER.isLoggable(Level.FINE)) {
|
|
||||||
LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
return new FrequencyTrie<>(arrayFactory, rootNode, metadata);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -438,73 +462,6 @@ public final class FrequencyTrie<V> {
|
|||||||
dataOutput.writeUTF(metadata.toTextBlock());
|
dataOutput.writeUTF(metadata.toTextBlock());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads persisted trie metadata while remaining backward compatible with
|
|
||||||
* earlier stream versions.
|
|
||||||
*
|
|
||||||
* @param dataInput input stream
|
|
||||||
* @param version persisted stream version
|
|
||||||
* @return deserialized metadata
|
|
||||||
* @throws IOException if the metadata section is invalid
|
|
||||||
*/
|
|
||||||
private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException {
|
|
||||||
if (version >= 5) { // NOPMD
|
|
||||||
try {
|
|
||||||
return TrieMetadata.fromTextBlock(version, dataInput.readUTF());
|
|
||||||
} catch (IllegalArgumentException exception) {
|
|
||||||
throw new IOException("Invalid metadata block.", exception);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
final WordTraversalDirection traversalDirection;
|
|
||||||
if (version >= 2) { // NOPMD
|
|
||||||
final int traversalDirectionOrdinal = dataInput.readInt();
|
|
||||||
final WordTraversalDirection[] traversalDirections = WordTraversalDirection.values();
|
|
||||||
if (traversalDirectionOrdinal < 0 || traversalDirectionOrdinal >= traversalDirections.length) {
|
|
||||||
throw new IOException("Invalid traversal direction ordinal: " + traversalDirectionOrdinal);
|
|
||||||
}
|
|
||||||
traversalDirection = traversalDirections[traversalDirectionOrdinal];
|
|
||||||
} else {
|
|
||||||
traversalDirection = WordTraversalDirection.BACKWARD;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (version < 3) { // NOPMD
|
|
||||||
return TrieMetadata.legacy(version, traversalDirection);
|
|
||||||
}
|
|
||||||
|
|
||||||
final ReductionMode[] reductionModes = ReductionMode.values();
|
|
||||||
final int reductionModeOrdinal = dataInput.readInt();
|
|
||||||
if (reductionModeOrdinal < 0 || reductionModeOrdinal >= reductionModes.length) {
|
|
||||||
throw new IOException("Invalid reduction mode ordinal: " + reductionModeOrdinal);
|
|
||||||
}
|
|
||||||
|
|
||||||
final int dominantWinnerMinPercent = dataInput.readInt();
|
|
||||||
final int dominantWinnerOverSecondRatio = dataInput.readInt(); // NOPMD
|
|
||||||
|
|
||||||
final DiacriticProcessingMode[] diacriticProcessingModes = DiacriticProcessingMode.values();
|
|
||||||
final int diacriticProcessingModeOrdinal = dataInput.readInt(); // NOPMD
|
|
||||||
if (diacriticProcessingModeOrdinal < 0 || diacriticProcessingModeOrdinal >= diacriticProcessingModes.length) {
|
|
||||||
throw new IOException("Invalid diacritic processing mode ordinal: " + diacriticProcessingModeOrdinal);
|
|
||||||
}
|
|
||||||
|
|
||||||
final CaseProcessingMode caseProcessingMode;
|
|
||||||
if (version >= 4) { // NOPMD
|
|
||||||
final CaseProcessingMode[] caseProcessingModes = CaseProcessingMode.values();
|
|
||||||
final int caseProcessingModeOrdinal = dataInput.readInt();
|
|
||||||
if (caseProcessingModeOrdinal < 0 || caseProcessingModeOrdinal >= caseProcessingModes.length) {
|
|
||||||
throw new IOException("Invalid case processing mode ordinal: " + caseProcessingModeOrdinal);
|
|
||||||
}
|
|
||||||
caseProcessingMode = caseProcessingModes[caseProcessingModeOrdinal];
|
|
||||||
} else {
|
|
||||||
caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
|
|
||||||
}
|
|
||||||
|
|
||||||
return new TrieMetadata(version, traversalDirection,
|
|
||||||
new ReductionSettings(reductionModes[reductionModeOrdinal], dominantWinnerMinPercent,
|
|
||||||
dominantWinnerOverSecondRatio),
|
|
||||||
diacriticProcessingModes[diacriticProcessingModeOrdinal], caseProcessingMode);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the number of canonical compiled nodes reachable from the root.
|
* Returns the number of canonical compiled nodes reachable from the root.
|
||||||
*
|
*
|
||||||
@@ -574,103 +531,218 @@ public final class FrequencyTrie<V> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads all compiled nodes and resolves child references.
|
* Internal helper that materializes serialized trie data.
|
||||||
*
|
|
||||||
* @param dataInput input
|
|
||||||
* @param arrayFactory array factory
|
|
||||||
* @param valueCodec value codec
|
|
||||||
* @param nodeCount number of nodes
|
|
||||||
* @param <V> value type
|
|
||||||
* @return array of nodes indexed by serialized node identifier
|
|
||||||
* @throws IOException if reading fails or the stream is invalid
|
|
||||||
*/
|
|
||||||
@SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops")
|
|
||||||
private static <V> CompiledNode<V>[] readNodes(final DataInputStream dataInput, final IntFunction<V[]> arrayFactory,
|
|
||||||
final ValueStreamCodec<V> valueCodec, final int nodeCount) throws IOException {
|
|
||||||
final List<NodeData<V>> nodeDataList = new ArrayList<>(nodeCount);
|
|
||||||
|
|
||||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
|
||||||
final int edgeCount = dataInput.readInt();
|
|
||||||
if (edgeCount < 0) {
|
|
||||||
throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
final char[] edgeLabels = new char[edgeCount];
|
|
||||||
final int[] childNodeIds = new int[edgeCount];
|
|
||||||
|
|
||||||
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
|
|
||||||
edgeLabels[edgeIndex] = dataInput.readChar();
|
|
||||||
childNodeIds[edgeIndex] = dataInput.readInt();
|
|
||||||
}
|
|
||||||
|
|
||||||
validateSerializedEdges(nodeIndex, edgeLabels);
|
|
||||||
|
|
||||||
final int valueCount = dataInput.readInt();
|
|
||||||
if (valueCount < 0) {
|
|
||||||
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
final V[] orderedValues = arrayFactory.apply(valueCount);
|
|
||||||
final int[] orderedCounts = new int[valueCount];
|
|
||||||
|
|
||||||
for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) {
|
|
||||||
orderedValues[valueIndex] = valueCodec.read(dataInput);
|
|
||||||
orderedCounts[valueIndex] = dataInput.readInt();
|
|
||||||
if (orderedCounts[valueIndex] <= 0) {
|
|
||||||
throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index "
|
|
||||||
+ valueIndex + ": " + orderedCounts[valueIndex]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
nodeDataList.add(new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts));
|
|
||||||
}
|
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
final CompiledNode<V>[] nodes = new CompiledNode[nodeCount];
|
|
||||||
|
|
||||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
|
||||||
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
final CompiledNode<V>[] children = new CompiledNode[nodeData.childNodeIds().length];
|
|
||||||
nodes[nodeIndex] = new CompiledNode<>(nodeData.edgeLabels(), children, nodeData.orderedValues(),
|
|
||||||
nodeData.orderedCounts());
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
|
||||||
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
|
|
||||||
final CompiledNode<V> node = nodes[nodeIndex];
|
|
||||||
|
|
||||||
for (int edgeIndex = 0; edgeIndex < nodeData.childNodeIds().length; edgeIndex++) {
|
|
||||||
final int childNodeId = nodeData.childNodeIds()[edgeIndex];
|
|
||||||
if (childNodeId < 0 || childNodeId >= nodeCount) {
|
|
||||||
throw new IOException("Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex
|
|
||||||
+ ": " + childNodeId);
|
|
||||||
}
|
|
||||||
node.children()[edgeIndex] = nodes[childNodeId];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nodes;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Validates the serialized edge-label sequence for one node.
|
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* Compiled nodes rely on binary search for child lookup and therefore require
|
* Moving reader complexity into this helper keeps the public-facing class from
|
||||||
* edge labels to be stored in strict ascending order without duplicates.
|
* accumulating excessive class-level cyclomatic complexity while preserving the
|
||||||
* Rejecting malformed streams here keeps lookup semantics deterministic and
|
* same binary compatibility contract.
|
||||||
* avoids silently constructing a trie whose search behavior would be undefined.
|
* </p>
|
||||||
*
|
|
||||||
* @param nodeIndex serialized node identifier
|
|
||||||
* @param edgeLabels serialized edge labels
|
|
||||||
* @throws IOException if the edge labels are not strictly ascending
|
|
||||||
*/
|
*/
|
||||||
private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException {
|
private static final class CompiledTrieReader {
|
||||||
for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) {
|
|
||||||
if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) {
|
private static <V> FrequencyTrie<V> read(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
|
||||||
throw new IOException("Edge labels must be strictly ascending at node " + nodeIndex + ", edge index "
|
final ValueStreamCodec<V> valueCodec, final int maxExpandedIndex) throws IOException {
|
||||||
+ edgeIndex + ": '" + edgeLabels[edgeIndex - 1] + "' then '" + edgeLabels[edgeIndex] + "'.");
|
Objects.requireNonNull(inputStream, "inputStream");
|
||||||
|
Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||||
|
Objects.requireNonNull(valueCodec, "valueCodec");
|
||||||
|
if (maxExpandedIndex < -1) {
|
||||||
|
throw new IllegalArgumentException("maxExpandedIndex must be >= -1.");
|
||||||
|
}
|
||||||
|
|
||||||
|
final DataInputStream dataInput = wrapInputStream(inputStream);
|
||||||
|
final int magic = dataInput.readInt();
|
||||||
|
if (magic != STREAM_MAGIC) {
|
||||||
|
throw new IOException("Unsupported trie stream header: " + Integer.toHexString(magic));
|
||||||
|
}
|
||||||
|
|
||||||
|
final int version = dataInput.readInt();
|
||||||
|
if (version < MIN_STREAM_VERSION || version > STREAM_VERSION) {
|
||||||
|
throw new IOException("Unsupported trie stream version: " + version);
|
||||||
|
}
|
||||||
|
|
||||||
|
final int nodeCount = dataInput.readInt();
|
||||||
|
if (nodeCount < 0) {
|
||||||
|
throw new IOException("Negative node count: " + nodeCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
final int rootNodeId = dataInput.readInt();
|
||||||
|
if (rootNodeId < 0 || rootNodeId >= nodeCount) {
|
||||||
|
throw new IOException("Invalid root node id: " + rootNodeId);
|
||||||
|
}
|
||||||
|
|
||||||
|
final TrieMetadata sourceMetadata = readMetadata(dataInput, version);
|
||||||
|
final int effectiveMaxExpandedIndex = maxExpandedIndex >= 0 ? maxExpandedIndex : DEFAULT_MAX_EXPANDED_INDEX;
|
||||||
|
final CompiledNode<V>[] nodes = readNodes(dataInput, arrayFactory, valueCodec, nodeCount, effectiveMaxExpandedIndex);
|
||||||
|
final CompiledNode<V> rootNode = nodes[rootNodeId];
|
||||||
|
|
||||||
|
if (LOGGER.isLoggable(Level.FINE)) {
|
||||||
|
LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new FrequencyTrie<>(arrayFactory, rootNode, sourceMetadata);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static DataInputStream wrapInputStream(final InputStream inputStream) {
|
||||||
|
return inputStream instanceof DataInputStream
|
||||||
|
? (DataInputStream) inputStream
|
||||||
|
: new DataInputStream(inputStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static TrieMetadata readMetadata(final DataInputStream dataInput, final int version) throws IOException {
|
||||||
|
if (version == STREAM_VERSION) {
|
||||||
|
return readTextMetadata(dataInput);
|
||||||
|
}
|
||||||
|
|
||||||
|
final WordTraversalDirection traversalDirection = readTraversalDirection(dataInput, version);
|
||||||
|
if (version < REDUCTION_VERSION) {
|
||||||
|
return TrieMetadata.legacy(version, traversalDirection);
|
||||||
|
}
|
||||||
|
|
||||||
|
final ReductionSettings reductionSettings = readReductionSettings(dataInput);
|
||||||
|
final DiacriticProcessingMode diacriticProcessingMode = readEnumByOrdinal(dataInput, DiacriticProcessingMode.values(),
|
||||||
|
"diacritic processing mode");
|
||||||
|
final CaseProcessingMode caseProcessingMode = version >= CASE_VERSION
|
||||||
|
? readCaseProcessingMode(dataInput)
|
||||||
|
: CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
|
||||||
|
return new TrieMetadata(version, traversalDirection, reductionSettings, diacriticProcessingMode, caseProcessingMode);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static TrieMetadata readTextMetadata(final DataInputStream dataInput) throws IOException {
|
||||||
|
try {
|
||||||
|
return TrieMetadata.fromTextBlock(STREAM_VERSION, dataInput.readUTF());
|
||||||
|
} catch (IllegalArgumentException exception) {
|
||||||
|
throw new IOException("Invalid metadata block.", exception);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static WordTraversalDirection readTraversalDirection(final DataInputStream dataInput, final int version)
|
||||||
|
throws IOException {
|
||||||
|
if (version < TRAVERSAL_VERSION) {
|
||||||
|
return WordTraversalDirection.BACKWARD;
|
||||||
|
}
|
||||||
|
return readEnumByOrdinal(dataInput, WordTraversalDirection.values(), "traversal direction");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static ReductionSettings readReductionSettings(final DataInputStream dataInput) throws IOException {
|
||||||
|
final ReductionMode reductionMode = readEnumByOrdinal(dataInput, ReductionMode.values(), "reduction mode");
|
||||||
|
final int dominantWinnerMinPercent = dataInput.readInt();
|
||||||
|
final int dominantWinnerOverSecondRatio = dataInput.readInt(); // NOPMD
|
||||||
|
return new ReductionSettings(reductionMode, dominantWinnerMinPercent, dominantWinnerOverSecondRatio);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static CaseProcessingMode readCaseProcessingMode(final DataInputStream dataInput) throws IOException {
|
||||||
|
return readEnumByOrdinal(dataInput, CaseProcessingMode.values(), "case processing mode");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <E extends Enum<E>> E readEnumByOrdinal(final DataInputStream dataInput, final E[] values,
|
||||||
|
final String name) throws IOException {
|
||||||
|
final int ordinal = dataInput.readInt();
|
||||||
|
if (ordinal < 0 || ordinal >= values.length) {
|
||||||
|
throw new IOException("Invalid " + name + " ordinal: " + ordinal);
|
||||||
|
}
|
||||||
|
return values[ordinal];
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <V> CompiledNode<V>[] readNodes(final DataInputStream dataInput, final IntFunction<V[]> arrayFactory,
|
||||||
|
final ValueStreamCodec<V> valueCodec, final int nodeCount, final int maxExpandedIndex) throws IOException {
|
||||||
|
final char[][] edgeLabelsByNode = new char[nodeCount][];
|
||||||
|
final int[][] childNodeIdsByNode = new int[nodeCount][];
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final V[][] orderedValuesByNode = (V[][]) new Object[nodeCount][];
|
||||||
|
final int[][] orderedCountsByNode = new int[nodeCount][];
|
||||||
|
|
||||||
|
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||||
|
final int edgeCount = dataInput.readInt();
|
||||||
|
if (edgeCount < 0) {
|
||||||
|
throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
edgeLabelsByNode[nodeIndex] = new char[edgeCount];
|
||||||
|
childNodeIdsByNode[nodeIndex] = new int[edgeCount];
|
||||||
|
|
||||||
|
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
|
||||||
|
edgeLabelsByNode[nodeIndex][edgeIndex] = dataInput.readChar();
|
||||||
|
childNodeIdsByNode[nodeIndex][edgeIndex] = dataInput.readInt();
|
||||||
|
}
|
||||||
|
|
||||||
|
validateSerializedEdges(nodeIndex, edgeLabelsByNode[nodeIndex]);
|
||||||
|
|
||||||
|
final int valueCount = dataInput.readInt();
|
||||||
|
if (valueCount < 0) {
|
||||||
|
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
orderedValuesByNode[nodeIndex] = arrayFactory.apply(valueCount);
|
||||||
|
orderedCountsByNode[nodeIndex] = new int[valueCount];
|
||||||
|
|
||||||
|
for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) {
|
||||||
|
orderedValuesByNode[nodeIndex][valueIndex] = valueCodec.read(dataInput);
|
||||||
|
orderedCountsByNode[nodeIndex][valueIndex] = dataInput.readInt();
|
||||||
|
if (orderedCountsByNode[nodeIndex][valueIndex] <= 0) {
|
||||||
|
throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index "
|
||||||
|
+ valueIndex + ": " + orderedCountsByNode[nodeIndex][valueIndex]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final CompiledNode<V>[] nodes = new CompiledNode[nodeCount];
|
||||||
|
final boolean[] inProgress = new boolean[nodeCount];
|
||||||
|
|
||||||
|
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
|
||||||
|
nodes[nodeIndex] = resolveNode(nodeIndex, edgeLabelsByNode, childNodeIdsByNode, orderedValuesByNode,
|
||||||
|
orderedCountsByNode, nodes, inProgress, maxExpandedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
return nodes;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <V> CompiledNode<V> resolveNode(final int nodeIndex, final char[][] edgeLabelsByNode,
|
||||||
|
final int[][] childNodeIdsByNode, final V[][] orderedValuesByNode, final int[][] orderedCountsByNode,
|
||||||
|
final CompiledNode<V>[] nodes, final boolean[] inProgress, final int maxExpandedIndex) throws IOException {
|
||||||
|
final CompiledNode<V> cachedNode = nodes[nodeIndex];
|
||||||
|
if (cachedNode != null) {
|
||||||
|
return cachedNode;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inProgress[nodeIndex]) {
|
||||||
|
throw new IOException("Invalid serialized node graph: cyclic reference detected at node " + nodeIndex + '.');
|
||||||
|
}
|
||||||
|
inProgress[nodeIndex] = true;
|
||||||
|
try {
|
||||||
|
final char[] edgeLabels = edgeLabelsByNode[nodeIndex];
|
||||||
|
final int[] childNodeIds = childNodeIdsByNode[nodeIndex];
|
||||||
|
final int edgeCount = childNodeIds.length;
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final CompiledNode<V>[] children = new CompiledNode[edgeCount];
|
||||||
|
|
||||||
|
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
|
||||||
|
final int childNodeId = childNodeIds[edgeIndex];
|
||||||
|
if (childNodeId < 0 || childNodeId >= edgeLabelsByNode.length) {
|
||||||
|
throw new IOException(
|
||||||
|
"Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex + ": "
|
||||||
|
+ childNodeId);
|
||||||
|
}
|
||||||
|
children[edgeIndex] = resolveNode(childNodeId, edgeLabelsByNode, childNodeIdsByNode,
|
||||||
|
orderedValuesByNode, orderedCountsByNode, nodes, inProgress, maxExpandedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
final CompiledNode<V> node = new CompiledNode<>(edgeLabels, children, orderedValuesByNode[nodeIndex], maxExpandedIndex,
|
||||||
|
orderedCountsByNode[nodeIndex]);
|
||||||
|
nodes[nodeIndex] = node;
|
||||||
|
return node;
|
||||||
|
} finally {
|
||||||
|
inProgress[nodeIndex] = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException {
|
||||||
|
for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) {
|
||||||
|
if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) {
|
||||||
|
throw new IOException("Edge labels must be strictly ascending at node " + nodeIndex + ", edge index "
|
||||||
|
+ edgeIndex + ": '" + edgeLabels[edgeIndex - 1] + "' then '" + edgeLabels[edgeIndex] + "'.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -771,6 +843,16 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
private final DiacriticProcessingMode diacriticProcessingMode;
|
private final DiacriticProcessingMode diacriticProcessingMode;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Dense edge lookup span threshold.
|
||||||
|
* <p>
|
||||||
|
* This value controls a speed/memory trade-off during freezing:
|
||||||
|
* dense child lookup tables are allocated only for nodes whose child
|
||||||
|
* labels fit in this span.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
private final int maxExpandedIndex;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Mutable root node.
|
* Mutable root node.
|
||||||
*/
|
*/
|
||||||
@@ -837,11 +919,39 @@ public final class FrequencyTrie<V> {
|
|||||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||||
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
|
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
|
||||||
final DiacriticProcessingMode diacriticProcessingMode) {
|
final DiacriticProcessingMode diacriticProcessingMode) {
|
||||||
|
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, diacriticProcessingMode,
|
||||||
|
CompiledNode.DEFAULT_MAX_EXPANDED_INDEX);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new builder with the provided settings, explicit traversal
|
||||||
|
* direction, explicit case processing mode, explicit diacritic processing
|
||||||
|
* mode, and an explicit dense child lookup threshold.
|
||||||
|
*
|
||||||
|
* @param arrayFactory array factory
|
||||||
|
* @param reductionSettings reduction configuration
|
||||||
|
* @param traversalDirection logical key traversal direction
|
||||||
|
* @param caseProcessingMode dictionary case processing mode
|
||||||
|
* @param diacriticProcessingMode dictionary diacritic processing mode
|
||||||
|
* @param maxExpandedIndex dense lookup span override; zero disables
|
||||||
|
* dense lookup. Larger values increase direct
|
||||||
|
* indexing opportunities while potentially
|
||||||
|
* increasing materialization memory in nodes
|
||||||
|
* whose edge label span is within the limit.
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
*/
|
||||||
|
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||||
|
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
|
||||||
|
final DiacriticProcessingMode diacriticProcessingMode, final int maxExpandedIndex) {
|
||||||
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
|
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||||
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||||
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
|
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||||
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||||
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
||||||
|
if (maxExpandedIndex < 0) {
|
||||||
|
throw new IllegalArgumentException("maxExpandedIndex must be non-negative.");
|
||||||
|
}
|
||||||
|
this.maxExpandedIndex = maxExpandedIndex;
|
||||||
this.root = new MutableNode<>();
|
this.root = new MutableNode<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1098,7 +1208,7 @@ public final class FrequencyTrie<V> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
final CompiledNode<V> frozen = new CompiledNode<>(edges, childNodes, localSummary.orderedValues(),
|
final CompiledNode<V> frozen = new CompiledNode<>(edges, childNodes, localSummary.orderedValues(),
|
||||||
localSummary.orderedCounts());
|
this.maxExpandedIndex, localSummary.orderedCounts());
|
||||||
cache.put(reducedNode, frozen);
|
cache.put(reducedNode, frozen);
|
||||||
return frozen;
|
return frozen;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -94,6 +94,29 @@ public final class StemmerPatchTrieBinaryIO {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
||||||
|
* with an optional dense child lookup span override.
|
||||||
|
* <p>
|
||||||
|
* This is a runtime-only tuning parameter. The dense-span setting is not
|
||||||
|
* persisted in the file and does not change the compiled metadata.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param path source file
|
||||||
|
* @param maxExpandedIndex dense lookup span override; negative values use
|
||||||
|
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* @return deserialized trie
|
||||||
|
* @throws NullPointerException if {@code path} is {@code null}
|
||||||
|
* @throws IOException if reading or decompression fails
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> read(final Path path, final int maxExpandedIndex) throws IOException {
|
||||||
|
Objects.requireNonNull(path, "path");
|
||||||
|
|
||||||
|
try (InputStream fileInputStream = Files.newInputStream(path)) {
|
||||||
|
return read(fileInputStream, maxExpandedIndex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
||||||
* string.
|
* string.
|
||||||
@@ -108,6 +131,26 @@ public final class StemmerPatchTrieBinaryIO {
|
|||||||
return read(Path.of(fileName));
|
return read(Path.of(fileName));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a GZip-compressed binary patch-command trie from a filesystem path
|
||||||
|
* string with an optional dense child lookup span override.
|
||||||
|
* <p>
|
||||||
|
* This is a runtime-only tuning parameter. The dense-span setting is not
|
||||||
|
* persisted in the file and does not change the compiled metadata.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param fileName source file name or path string
|
||||||
|
* @param maxExpandedIndex dense lookup span override; negative values use
|
||||||
|
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* @return deserialized trie
|
||||||
|
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||||
|
* @throws IOException if reading or decompression fails
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> read(final String fileName, final int maxExpandedIndex) throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, "fileName");
|
||||||
|
return read(Path.of(fileName), maxExpandedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads a GZip-compressed binary patch-command trie from an input stream.
|
* Reads a GZip-compressed binary patch-command trie from an input stream.
|
||||||
*
|
*
|
||||||
@@ -132,6 +175,34 @@ public final class StemmerPatchTrieBinaryIO {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a GZip-compressed binary patch-command trie from an input stream with
|
||||||
|
* an optional dense child lookup span override.
|
||||||
|
* <p>
|
||||||
|
* This is a runtime-only tuning parameter. The dense-span setting is not
|
||||||
|
* persisted in the file and does not change the compiled metadata.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param inputStream source stream
|
||||||
|
* @param maxExpandedIndex dense lookup span override; negative values use
|
||||||
|
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* @return deserialized trie
|
||||||
|
* @throws NullPointerException if {@code inputStream} is {@code null}
|
||||||
|
* @throws IOException if reading or decompression fails
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> read(final InputStream inputStream, final int maxExpandedIndex) throws IOException {
|
||||||
|
Objects.requireNonNull(inputStream, "inputStream");
|
||||||
|
|
||||||
|
try (GZIPInputStream gzipInputStream = new GZIPInputStream(new BufferedInputStream(inputStream));
|
||||||
|
DataInputStream dataInputStream = new DataInputStream(gzipInputStream)) {
|
||||||
|
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(dataInputStream, String[]::new, STRING_CODEC,
|
||||||
|
maxExpandedIndex);
|
||||||
|
|
||||||
|
LOGGER.log(Level.FINE, "Read compressed binary stemmer trie.");
|
||||||
|
return trie;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reads only metadata from a GZip-compressed binary patch-command trie stored
|
* Reads only metadata from a GZip-compressed binary patch-command trie stored
|
||||||
* at a filesystem path.
|
* at a filesystem path.
|
||||||
|
|||||||
@@ -71,6 +71,7 @@ import java.util.zip.GZIPInputStream;
|
|||||||
public final class StemmerPatchTrieLoader {
|
public final class StemmerPatchTrieLoader {
|
||||||
|
|
||||||
/* default */ static final String FILENAME_REQUIRED = "fileName required";
|
/* default */ static final String FILENAME_REQUIRED = "fileName required";
|
||||||
|
private static final String PARAMETER_PATH = "path";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Logger of this class.
|
* Logger of this class.
|
||||||
@@ -460,8 +461,8 @@ public final class StemmerPatchTrieLoader {
|
|||||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||||
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
|
final CaseProcessingMode caseProcessingMode, final DiacriticProcessingMode diacriticProcessingMode)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
Objects.requireNonNull(path, "path");
|
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||||
final TrieMetadata metadata = metadataForCompilation(traversalDirection, reductionSettings, caseProcessingMode,
|
final TrieMetadata metadata = metadataForCompilation(traversalDirection, reductionSettings, caseProcessingMode,
|
||||||
diacriticProcessingMode);
|
diacriticProcessingMode);
|
||||||
return load(path, storeOriginal, metadata);
|
return load(path, storeOriginal, metadata);
|
||||||
@@ -487,7 +488,7 @@ public final class StemmerPatchTrieLoader {
|
|||||||
*/
|
*/
|
||||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal, final TrieMetadata metadata)
|
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal, final TrieMetadata metadata)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
Objects.requireNonNull(path, "path");
|
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||||
Objects.requireNonNull(metadata, "metadata");
|
Objects.requireNonNull(metadata, "metadata");
|
||||||
|
|
||||||
try (InputStream inputStream = openDictionaryInputStream(path);
|
try (InputStream inputStream = openDictionaryInputStream(path);
|
||||||
@@ -759,10 +760,31 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* read
|
* read
|
||||||
*/
|
*/
|
||||||
public static FrequencyTrie<String> loadBinary(final Path path) throws IOException {
|
public static FrequencyTrie<String> loadBinary(final Path path) throws IOException {
|
||||||
Objects.requireNonNull(path, "path");
|
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||||
return StemmerPatchTrieBinaryIO.read(path);
|
return StemmerPatchTrieBinaryIO.read(path);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
||||||
|
* using a custom dense lookup span override.
|
||||||
|
* <p>
|
||||||
|
* This is a runtime-only tuning parameter that does not affect persisted
|
||||||
|
* metadata.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param path path to the compressed binary trie file
|
||||||
|
* @param maxExpandedIndex dense lookup span override; negative values use
|
||||||
|
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if {@code path} is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened, decompressed, or
|
||||||
|
* read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> loadBinary(final Path path, final int maxExpandedIndex) throws IOException {
|
||||||
|
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||||
|
return StemmerPatchTrieBinaryIO.read(path, maxExpandedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
||||||
* string.
|
* string.
|
||||||
@@ -778,6 +800,27 @@ public final class StemmerPatchTrieLoader {
|
|||||||
return StemmerPatchTrieBinaryIO.read(fileName);
|
return StemmerPatchTrieBinaryIO.read(fileName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads a GZip-compressed binary patch-command trie from a filesystem path
|
||||||
|
* string using a custom dense lookup span override.
|
||||||
|
* <p>
|
||||||
|
* This is a runtime-only tuning parameter that does not affect persisted
|
||||||
|
* metadata.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param fileName file name or path string
|
||||||
|
* @param maxExpandedIndex dense lookup span override; negative values use
|
||||||
|
* {@link FrequencyTrie#DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* @return compiled patch-command trie
|
||||||
|
* @throws NullPointerException if {@code fileName} is {@code null}
|
||||||
|
* @throws IOException if the file cannot be opened, decompressed, or
|
||||||
|
* read
|
||||||
|
*/
|
||||||
|
public static FrequencyTrie<String> loadBinary(final String fileName, final int maxExpandedIndex) throws IOException {
|
||||||
|
Objects.requireNonNull(fileName, FILENAME_REQUIRED);
|
||||||
|
return StemmerPatchTrieBinaryIO.read(fileName, maxExpandedIndex);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a GZip-compressed binary patch-command trie from an input stream.
|
* Loads a GZip-compressed binary patch-command trie from an input stream.
|
||||||
*
|
*
|
||||||
@@ -802,7 +845,7 @@ public final class StemmerPatchTrieLoader {
|
|||||||
* read
|
* read
|
||||||
*/
|
*/
|
||||||
public static TrieMetadata loadBinaryMetadata(final Path path) throws IOException {
|
public static TrieMetadata loadBinaryMetadata(final Path path) throws IOException {
|
||||||
Objects.requireNonNull(path, "path");
|
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||||
return StemmerPatchTrieBinaryIO.readMetadata(path);
|
return StemmerPatchTrieBinaryIO.readMetadata(path);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -845,7 +888,7 @@ public final class StemmerPatchTrieLoader {
|
|||||||
*/
|
*/
|
||||||
public static void saveBinary(final FrequencyTrie<String> trie, final Path path) throws IOException {
|
public static void saveBinary(final FrequencyTrie<String> trie, final Path path) throws IOException {
|
||||||
Objects.requireNonNull(trie, "trie");
|
Objects.requireNonNull(trie, "trie");
|
||||||
Objects.requireNonNull(path, "path");
|
Objects.requireNonNull(path, PARAMETER_PATH);
|
||||||
StemmerPatchTrieBinaryIO.write(trie, path);
|
StemmerPatchTrieBinaryIO.write(trie, path);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -43,14 +43,15 @@ import java.util.Objects;
|
|||||||
* immutable from the public API perspective because construction wires these
|
* immutable from the public API perspective because construction wires these
|
||||||
* arrays once and all lookup operations thereafter treat them as read-only.
|
* arrays once and all lookup operations thereafter treat them as read-only.
|
||||||
*
|
*
|
||||||
* @param <V> value type
|
* @param <V> value type
|
||||||
* @param edgeLabels internal edge label array
|
|
||||||
* @param children internal child array
|
|
||||||
* @param orderedValues internal ordered values array
|
|
||||||
* @param orderedCounts internal ordered counts array
|
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("PMD.DataClass")
|
public final class CompiledNode<V> {
|
||||||
public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues, int... orderedCounts) {
|
|
||||||
|
/**
|
||||||
|
* Default dense child lookup span in characters used when an explicit override is
|
||||||
|
* not provided.
|
||||||
|
*/
|
||||||
|
public static final int DEFAULT_MAX_EXPANDED_INDEX = 512;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Number of child edges where linear scan is cheaper than binary search.
|
* Number of child edges where linear scan is cheaper than binary search.
|
||||||
@@ -58,24 +59,112 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
|||||||
private static final int LINEAR_CHILD_COUNT_THRESHOLD = 4;
|
private static final int LINEAR_CHILD_COUNT_THRESHOLD = 4;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates one validated compiled node.
|
* Edge labels in sorted ascending order.
|
||||||
|
*/
|
||||||
|
private final char[] edgeLabels;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sparse child array aligned with {@link #edgeLabels}.
|
||||||
|
*/
|
||||||
|
private final CompiledNode<V>[] children;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Dense child lookup table used when labels fit into a compact char interval.
|
||||||
|
* <p>
|
||||||
|
* The table enables direct O(1) indexing for child lookup and is allocated
|
||||||
|
* only when the character span of this node's edges is within the configured
|
||||||
|
* threshold.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
private final CompiledNode<V>[] denseChildren;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalized minimum edge value for the dense lookup table.
|
||||||
|
*/
|
||||||
|
private final int denseEdgeMin;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Values stored at this node in local order.
|
||||||
|
*/
|
||||||
|
private final V[] orderedValues;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Occurrence counts aligned with {@link #orderedValues}.
|
||||||
|
*/
|
||||||
|
private final int[] orderedCounts;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates one validated compiled node using {@link #DEFAULT_MAX_EXPANDED_INDEX}
|
||||||
|
* for dense lookup sizing.
|
||||||
*
|
*
|
||||||
* @throws NullPointerException if any array argument is {@code null}
|
* @throws NullPointerException if any array argument is {@code null}
|
||||||
* @throws IllegalArgumentException if the edge-related arrays or value-related
|
* @throws IllegalArgumentException if the edge-related arrays or value-related
|
||||||
* arrays do not have matching lengths
|
* arrays do not have matching lengths
|
||||||
*/
|
*/
|
||||||
public CompiledNode {
|
public CompiledNode(final char[] edgeLabels, final CompiledNode<V>[] children, final V[] orderedValues,
|
||||||
|
final int... orderedCounts) {
|
||||||
|
this(edgeLabels, children, orderedValues, DEFAULT_MAX_EXPANDED_INDEX, orderedCounts);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates one validated compiled node.
|
||||||
|
*
|
||||||
|
* @param maxExpandedIndex upper bound for the dense lookup interval size; zero
|
||||||
|
* disables dense lookup. Larger values improve
|
||||||
|
* direct-index likelihood while increasing dense
|
||||||
|
* table memory in compact-label nodes.
|
||||||
|
* @throws NullPointerException if any array argument is {@code null}
|
||||||
|
* @throws IllegalArgumentException if the edge-related arrays or value-related
|
||||||
|
* arrays do not have matching lengths or the
|
||||||
|
* dense interval size is negative
|
||||||
|
*/
|
||||||
|
public CompiledNode(final char[] edgeLabels, final CompiledNode<V>[] children, final V[] orderedValues,
|
||||||
|
final int maxExpandedIndex, final int... orderedCounts) {
|
||||||
Objects.requireNonNull(edgeLabels, "edgeLabels");
|
Objects.requireNonNull(edgeLabels, "edgeLabels");
|
||||||
Objects.requireNonNull(children, "children");
|
Objects.requireNonNull(children, "children");
|
||||||
Objects.requireNonNull(orderedValues, "orderedValues");
|
Objects.requireNonNull(orderedValues, "orderedValues");
|
||||||
Objects.requireNonNull(orderedCounts, "orderedCounts");
|
Objects.requireNonNull(orderedCounts, "orderedCounts");
|
||||||
|
|
||||||
|
if (maxExpandedIndex < 0) {
|
||||||
|
throw new IllegalArgumentException("maxExpandedIndex must be non-negative.");
|
||||||
|
}
|
||||||
|
|
||||||
if (edgeLabels.length != children.length) {
|
if (edgeLabels.length != children.length) {
|
||||||
throw new IllegalArgumentException("edgeLabels and children must have the same length.");
|
throw new IllegalArgumentException("edgeLabels and children must have the same length.");
|
||||||
}
|
}
|
||||||
if (orderedValues.length != orderedCounts.length) {
|
if (orderedValues.length != orderedCounts.length) {
|
||||||
throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
|
throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this.edgeLabels = edgeLabels;
|
||||||
|
this.children = children;
|
||||||
|
this.orderedValues = orderedValues;
|
||||||
|
this.orderedCounts = orderedCounts;
|
||||||
|
|
||||||
|
if (edgeLabels.length == 0 || maxExpandedIndex == 0) {
|
||||||
|
this.denseChildren = null;
|
||||||
|
this.denseEdgeMin = 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
final int minEdge = edgeLabels[0];
|
||||||
|
final int maxEdge = edgeLabels[edgeLabels.length - 1];
|
||||||
|
final int span = maxEdge - minEdge;
|
||||||
|
|
||||||
|
if (span < 0 || span > maxExpandedIndex) {
|
||||||
|
this.denseChildren = null;
|
||||||
|
this.denseEdgeMin = 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final CompiledNode<V>[] dense = (CompiledNode<V>[]) new CompiledNode[span + 1];
|
||||||
|
for (int edgeIndex = 0; edgeIndex < edgeLabels.length; edgeIndex++) {
|
||||||
|
dense[edgeLabels[edgeIndex] - minEdge] = children[edgeIndex];
|
||||||
|
}
|
||||||
|
|
||||||
|
this.denseChildren = dense;
|
||||||
|
this.denseEdgeMin = minEdge;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -87,7 +176,6 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
|||||||
*
|
*
|
||||||
* @return internal edge-label array
|
* @return internal edge-label array
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||||
public char[] edgeLabels() {
|
public char[] edgeLabels() {
|
||||||
return this.edgeLabels;
|
return this.edgeLabels;
|
||||||
@@ -102,7 +190,6 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
|||||||
*
|
*
|
||||||
* @return internal child-node array
|
* @return internal child-node array
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||||
public CompiledNode<V>[] children() {
|
public CompiledNode<V>[] children() {
|
||||||
return this.children;
|
return this.children;
|
||||||
@@ -117,7 +204,6 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
|||||||
*
|
*
|
||||||
* @return internal ordered-values array
|
* @return internal ordered-values array
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||||
public V[] orderedValues() {
|
public V[] orderedValues() {
|
||||||
return this.orderedValues;
|
return this.orderedValues;
|
||||||
@@ -132,14 +218,143 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
|||||||
*
|
*
|
||||||
* @return internal ordered-counts array
|
* @return internal ordered-counts array
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||||
public int[] orderedCounts() {
|
public int[] orderedCounts() {
|
||||||
return this.orderedCounts;
|
return this.orderedCounts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of child edges represented by this node.
|
||||||
|
*
|
||||||
|
* @return child edge count
|
||||||
|
*/
|
||||||
|
public int edgeCount() {
|
||||||
|
return this.edgeLabels.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of values stored in this node.
|
||||||
|
*
|
||||||
|
* @return value count
|
||||||
|
*/
|
||||||
|
public int valueCount() {
|
||||||
|
return this.orderedValues.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates whether this node stores any values.
|
||||||
|
*
|
||||||
|
* @return {@code true} when values are present at this node
|
||||||
|
*/
|
||||||
|
public boolean hasValues() {
|
||||||
|
return this.orderedValues.length > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates whether this node has child edges.
|
||||||
|
*
|
||||||
|
* @return {@code true} when this node has at least one outgoing edge
|
||||||
|
*/
|
||||||
|
public boolean hasChildren() {
|
||||||
|
return this.edgeLabels.length > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates whether this node has no child edges.
|
||||||
|
*
|
||||||
|
* @return {@code true} when this node is a terminal leaf node
|
||||||
|
*/
|
||||||
|
public boolean isLeaf() {
|
||||||
|
return !hasChildren();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests whether an edge label is present at this node.
|
||||||
|
*
|
||||||
|
* @param edge edge label
|
||||||
|
* @return {@code true} if this node contains the supplied edge label
|
||||||
|
*/
|
||||||
|
public boolean hasEdge(final char edge) {
|
||||||
|
return findChild(edge) != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicates whether this node has a dense direct-index child lookup table.
|
||||||
|
*
|
||||||
|
* @return {@code true} when a direct-index child table is available
|
||||||
|
*/
|
||||||
|
public boolean hasDenseLookup() {
|
||||||
|
return this.denseChildren != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of slots allocated in this node's dense child lookup table.
|
||||||
|
*
|
||||||
|
* @return number of dense table slots, or {@code 0} when dense lookup is not
|
||||||
|
* enabled
|
||||||
|
*/
|
||||||
|
public int denseTableLength() {
|
||||||
|
return this.denseChildren == null ? 0 : this.denseChildren.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a hash code derived from the same structure and contents compared by {@link #equals(Object)}.
|
||||||
|
*
|
||||||
|
* @return summary hash for node structure and contents
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
int hash = Arrays.hashCode(this.edgeLabels);
|
||||||
|
hash = 31 * hash + Arrays.hashCode(this.children);
|
||||||
|
hash = 31 * hash + Arrays.hashCode(this.orderedValues);
|
||||||
|
hash = 31 * hash + Arrays.hashCode(this.orderedCounts);
|
||||||
|
hash = 31 * hash + Objects.hash(this.denseEdgeMin);
|
||||||
|
hash = 31 * hash + (hasDenseLookup() ? Arrays.hashCode(this.denseChildren) : 0);
|
||||||
|
return hash;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares structural node content, including dense table availability.
|
||||||
|
*
|
||||||
|
* @param object comparison object
|
||||||
|
* @return {@code true} when nodes describe identical structure and payload
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public boolean equals(final Object object) {
|
||||||
|
if (this == object) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (!(object instanceof CompiledNode<?> other)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return Arrays.equals(this.edgeLabels, other.edgeLabels) && Arrays.equals(this.children, other.children)
|
||||||
|
&& Arrays.equals(this.orderedValues, other.orderedValues) && Arrays.equals(this.orderedCounts, other.orderedCounts)
|
||||||
|
&& this.denseEdgeMin == other.denseEdgeMin && Arrays.equals(this.denseChildren, other.denseChildren);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a short summary useful for debugging and diagnostics.
|
||||||
|
*
|
||||||
|
* @return textual node summary
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "CompiledNode{"
|
||||||
|
+ "edgeCount=" + this.edgeLabels.length + ", orderedValueCount=" + this.orderedValues.length
|
||||||
|
+ ", denseTableLength=" + denseTableLength() + '}';
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Finds a child for the supplied edge character.
|
* Finds a child for the supplied edge character.
|
||||||
|
* <p>
|
||||||
|
* Lookup order is:
|
||||||
|
* <ol>
|
||||||
|
* <li>dense array index (if the label interval is compact enough),</li>
|
||||||
|
* <li>small-child linear scan when the fallback node has {@value #LINEAR_CHILD_COUNT_THRESHOLD}
|
||||||
|
* or fewer edges,</li>
|
||||||
|
* <li>binary search over sorted labels.</li>
|
||||||
|
* </ol>
|
||||||
|
* When a dense table exists its result is final; the scan and search paths are skipped.
|
||||||
*
|
*
|
||||||
* @param edge edge character
|
* @param edge edge character
|
||||||
* @return child node, or {@code null} if absent
|
* @return child node, or {@code null} if absent
|
||||||
@@ -149,6 +364,15 @@ public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[]
|
|||||||
if (childCount == 0) {
|
if (childCount == 0) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (this.denseChildren != null) {
|
||||||
|
final int denseIndex = edge - this.denseEdgeMin;
|
||||||
|
if (denseIndex < 0 || denseIndex >= this.denseChildren.length) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return this.denseChildren[denseIndex];
|
||||||
|
}
|
||||||
|
|
||||||
if (childCount <= LINEAR_CHILD_COUNT_THRESHOLD) {
|
if (childCount <= LINEAR_CHILD_COUNT_THRESHOLD) {
|
||||||
for (int index = 0; index < childCount; index++) {
|
for (int index = 0; index < childCount; index++) {
|
||||||
if (this.edgeLabels[index] == edge) {
|
if (this.edgeLabels[index] == edge) {
|
||||||
|
|||||||
@@ -95,6 +95,8 @@ import org.junit.jupiter.params.provider.MethodSource;
|
|||||||
@Tag("integration")
|
@Tag("integration")
|
||||||
@Tag("cli")
|
@Tag("cli")
|
||||||
@Tag("stemmer")
|
@Tag("stemmer")
|
||||||
|
@Tag("compile")
|
||||||
|
@Tag("slow")
|
||||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||||
@DisplayName("Compile integration")
|
@DisplayName("Compile integration")
|
||||||
final class CompileIntegrationTest {
|
final class CompileIntegrationTest {
|
||||||
@@ -189,9 +191,10 @@ final class CompileIntegrationTest {
|
|||||||
* create nested output directories, preserve expected lookup behavior, and
|
* create nested output directories, preserve expected lookup behavior, and
|
||||||
* store canonical stems when {@code --store-original} is enabled.
|
* store canonical stems when {@code --store-original} is enabled.
|
||||||
*
|
*
|
||||||
* @throws IOException if reading or writing fails
|
* @throws IOException if reading or writing fails
|
||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
|
@Tag("slow")
|
||||||
@DisplayName("CLI should compile the remark-aware fixture and preserve expected lookups")
|
@DisplayName("CLI should compile the remark-aware fixture and preserve expected lookups")
|
||||||
void shouldCompileRemarkAwareFixtureAndPreserveExpectedLookups() throws IOException {
|
void shouldCompileRemarkAwareFixtureAndPreserveExpectedLookups() throws IOException {
|
||||||
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
|
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
|
||||||
@@ -234,9 +237,10 @@ final class CompileIntegrationTest {
|
|||||||
* Verifies that the CLI rejects an already existing output path unless
|
* Verifies that the CLI rejects an already existing output path unless
|
||||||
* overwrite is explicitly enabled.
|
* overwrite is explicitly enabled.
|
||||||
*
|
*
|
||||||
* @throws IOException if reading or writing fails
|
* @throws IOException if reading or writing fails
|
||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
|
@Tag("slow")
|
||||||
@DisplayName("CLI should require overwrite before replacing an existing output artifact")
|
@DisplayName("CLI should require overwrite before replacing an existing output artifact")
|
||||||
void shouldRequireOverwriteForExistingOutput() throws IOException {
|
void shouldRequireOverwriteForExistingOutput() throws IOException {
|
||||||
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
|
final Path inputFile = copyResourceToTemporaryFile(REMARK_AWARE_DICTIONARY_RESOURCE,
|
||||||
@@ -301,6 +305,7 @@ final class CompileIntegrationTest {
|
|||||||
|
|
||||||
@Nested
|
@Nested
|
||||||
@DisplayName("Bundled project dictionary workflows")
|
@DisplayName("Bundled project dictionary workflows")
|
||||||
|
@Tag("slow")
|
||||||
final class BundledProjectDictionaryWorkflows {
|
final class BundledProjectDictionaryWorkflows {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -317,11 +322,12 @@ final class CompileIntegrationTest {
|
|||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @param scenario scenario identifier
|
* @param scenario scenario identifier
|
||||||
* @param resourcePath bundled dictionary resource path
|
* @param resourcePath bundled dictionary resource path
|
||||||
* @throws IOException if reading or writing fails
|
* @throws IOException if reading or writing fails
|
||||||
*/
|
*/
|
||||||
@ParameterizedTest(name = "[{index}] {0}")
|
@ParameterizedTest(name = "[{index}] {0}")
|
||||||
@MethodSource("org.egothor.stemmer.CompileIntegrationTest#bundledDictionaryCases")
|
@MethodSource("org.egothor.stemmer.CompileIntegrationTest#bundledDictionaryCases")
|
||||||
|
@Tag("slow")
|
||||||
@DisplayName("CLI should compile bundled project dictionaries and preserve representative variant semantics")
|
@DisplayName("CLI should compile bundled project dictionaries and preserve representative variant semantics")
|
||||||
void shouldCompileBundledProjectDictionaryAndPreserveRepresentativeVariantSemantics(final String scenario,
|
void shouldCompileBundledProjectDictionaryAndPreserveRepresentativeVariantSemantics(final String scenario,
|
||||||
final String resourcePath) throws IOException {
|
final String resourcePath) throws IOException {
|
||||||
|
|||||||
@@ -66,7 +66,10 @@ import org.junit.jupiter.api.io.TempDir;
|
|||||||
* {@link System#exit(int)}.
|
* {@link System#exit(int)}.
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("integration")
|
||||||
|
@Tag("cli")
|
||||||
|
@Tag("compile")
|
||||||
|
@Tag("stemmer")
|
||||||
@DisplayName("Compile")
|
@DisplayName("Compile")
|
||||||
class CompileTest {
|
class CompileTest {
|
||||||
|
|
||||||
|
|||||||
@@ -70,10 +70,11 @@ import org.junit.jupiter.params.provider.MethodSource;
|
|||||||
* <li>compressed artifact reproducibility within the active format version</li>
|
* <li>compressed artifact reproducibility within the active format version</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("compat")
|
||||||
@Tag("regression")
|
@Tag("regression")
|
||||||
@Tag("determinism")
|
@Tag("determinism")
|
||||||
@Tag("serialization")
|
@Tag("serialization")
|
||||||
|
@Tag("trie")
|
||||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||||
final class CompiledTrieArtifactRegressionTest {
|
final class CompiledTrieArtifactRegressionTest {
|
||||||
|
|
||||||
|
|||||||
@@ -41,7 +41,8 @@ import org.junit.jupiter.api.Test;
|
|||||||
* Unit tests for {@link DiacriticStripper}.
|
* Unit tests for {@link DiacriticStripper}.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("diacritics")
|
@Tag("diacritic")
|
||||||
|
@Tag("stemmer")
|
||||||
@DisplayName("DiacriticStripper")
|
@DisplayName("DiacriticStripper")
|
||||||
class DiacriticStripperTest {
|
class DiacriticStripperTest {
|
||||||
|
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
*/
|
*/
|
||||||
@DisplayName("FrequencyTrieBuilders")
|
@DisplayName("FrequencyTrieBuilders")
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("builder")
|
@Tag("construction")
|
||||||
@Tag("frequency-trie")
|
@Tag("frequency-trie")
|
||||||
class FrequencyTrieBuildersTest {
|
class FrequencyTrieBuildersTest {
|
||||||
|
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ import java.util.List;
|
|||||||
import net.jqwik.api.ForAll;
|
import net.jqwik.api.ForAll;
|
||||||
import net.jqwik.api.Label;
|
import net.jqwik.api.Label;
|
||||||
import net.jqwik.api.Property;
|
import net.jqwik.api.Property;
|
||||||
import net.jqwik.api.Tag;
|
import org.junit.jupiter.api.Tag;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Property-based tests for the compiled trie abstraction.
|
* Property-based tests for the compiled trie abstraction.
|
||||||
@@ -59,9 +59,9 @@ import net.jqwik.api.Tag;
|
|||||||
* core algorithm without overfitting to particular fixture data.
|
* core algorithm without overfitting to particular fixture data.
|
||||||
*/
|
*/
|
||||||
@Label("FrequencyTrie properties")
|
@Label("FrequencyTrie properties")
|
||||||
@Tag("unit")
|
|
||||||
@Tag("property")
|
@Tag("property")
|
||||||
@Tag("trie")
|
@Tag("trie")
|
||||||
|
@Tag("frequency-trie")
|
||||||
class FrequencyTrieProperties extends PropertyBasedTestSupport {
|
class FrequencyTrieProperties extends PropertyBasedTestSupport {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ package org.egothor.stemmer;
|
|||||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||||
@@ -379,6 +380,24 @@ class FrequencyTrieTest {
|
|||||||
assertThrows(UnsupportedOperationException.class, () -> entries.add(new ValueCount<String>("z", 1)));
|
assertThrows(UnsupportedOperationException.class, () -> entries.add(new ValueCount<String>("z", 1)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that {@link FrequencyTrie#getEntries(String)} short-circuits to a one-item immutable list.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("getEntries returns a one-item list for single stored values")
|
||||||
|
void getEntriesReturnsSingleItemListForSingleStoredValue() {
|
||||||
|
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||||
|
|
||||||
|
builder.put("gamma", "only");
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = builder.build();
|
||||||
|
|
||||||
|
final List<ValueCount<String>> entries = trie.getEntries("gamma");
|
||||||
|
|
||||||
|
assertAll(() -> assertEquals(List.of(new ValueCount<String>("only", 1)), entries),
|
||||||
|
() -> assertThrows(UnsupportedOperationException.class, () -> entries.add(new ValueCount<String>("z", 1))));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that equal frequencies prefer the shorter string representation.
|
* Verifies that equal frequencies prefer the shorter string representation.
|
||||||
*/
|
*/
|
||||||
@@ -755,6 +774,115 @@ class FrequencyTrieTest {
|
|||||||
.readFrom(new ByteArrayInputStream(serializedEmptyTrie), String[]::new, null)));
|
.readFrom(new ByteArrayInputStream(serializedEmptyTrie), String[]::new, null)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that reading a compiled trie with a negative max-expanded override
|
||||||
|
* smaller than -1 is rejected.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom rejects invalid maxExpandedIndex override")
|
||||||
|
void readFromRejectsInvalidMaxExpandedIndexOverride() {
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||||
|
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC, -2));
|
||||||
|
|
||||||
|
assertEquals("maxExpandedIndex must be >= -1.", exception.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that the max-expanded override controls dense lookup materialization
|
||||||
|
* while preserving lookup semantics.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom respects dense lookup max-expanded index override")
|
||||||
|
void readFromRespectsDenseLookupMaxExpandedIndexOverride() throws IOException {
|
||||||
|
final FrequencyTrie.Builder<String> builder = rankedBuilder();
|
||||||
|
|
||||||
|
builder.put("a", "a");
|
||||||
|
builder.put("b", "b");
|
||||||
|
builder.put("c", "c");
|
||||||
|
builder.put("d", "d");
|
||||||
|
|
||||||
|
final FrequencyTrie<String> original = builder.build();
|
||||||
|
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||||
|
original.writeTo(outputStream, STRING_CODEC);
|
||||||
|
final byte[] serializedTrie = outputStream.toByteArray();
|
||||||
|
|
||||||
|
final FrequencyTrie<String> defaultDense = FrequencyTrie.readFrom(new ByteArrayInputStream(serializedTrie), String[]::new,
|
||||||
|
STRING_CODEC);
|
||||||
|
final FrequencyTrie<String> defaultDenseByNegative = FrequencyTrie.readFrom(new ByteArrayInputStream(serializedTrie),
|
||||||
|
String[]::new, STRING_CODEC, -1);
|
||||||
|
final FrequencyTrie<String> disabledDense = FrequencyTrie.readFrom(new ByteArrayInputStream(serializedTrie), String[]::new,
|
||||||
|
STRING_CODEC, 0);
|
||||||
|
|
||||||
|
assertAll(
|
||||||
|
() -> assertTrue(defaultDense.root().hasDenseLookup(),
|
||||||
|
"Default read should enable dense lookup for compact first-level edges."),
|
||||||
|
() -> assertTrue(defaultDenseByNegative.root().hasDenseLookup(),
|
||||||
|
"Negative override should use the default dense lookup span."),
|
||||||
|
() -> assertFalse(disabledDense.root().hasDenseLookup(),
|
||||||
|
"Zero override should disable dense lookup tables."),
|
||||||
|
() -> assertEquals(original.get("a"), disabledDense.get("a")),
|
||||||
|
() -> assertEquals(original.get("b"), disabledDense.get("b")),
|
||||||
|
() -> assertEquals(original.get("c"), disabledDense.get("c")),
|
||||||
|
() -> assertEquals(original.get("d"), disabledDense.get("d")),
|
||||||
|
() -> assertEquals(original.get("z"), disabledDense.get("z")));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that cyclic serialized node references are rejected as invalid
|
||||||
|
* serialization.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom rejects cyclic serialized node references")
|
||||||
|
void readFromRejectsCyclicSerializedNodeReferences() {
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 1, 2, 0, new NodeWriter[] {
|
||||||
|
dataOutput -> {
|
||||||
|
dataOutput.writeInt(1);
|
||||||
|
dataOutput.writeChar('b');
|
||||||
|
dataOutput.writeInt(1);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
},
|
||||||
|
dataOutput -> {
|
||||||
|
dataOutput.writeInt(1);
|
||||||
|
dataOutput.writeChar('a');
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final IOException exception = assertThrows(IOException.class,
|
||||||
|
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||||
|
|
||||||
|
assertTrue(exception.getMessage().contains("cyclic reference detected"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that child node references outside the valid serialized range are
|
||||||
|
* rejected.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom rejects invalid child node identifiers")
|
||||||
|
void readFromRejectsInvalidChildNodeId() {
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(1);
|
||||||
|
dataOutput.writeChar('a');
|
||||||
|
dataOutput.writeInt(3);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final IOException exception = assertThrows(IOException.class,
|
||||||
|
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||||
|
|
||||||
|
assertTrue(exception.getMessage().contains("Invalid child node id"));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that deserialization rejects an invalid stream magic header.
|
* Verifies that deserialization rejects an invalid stream magic header.
|
||||||
*/
|
*/
|
||||||
@@ -785,6 +913,27 @@ class FrequencyTrieTest {
|
|||||||
assertTrue(exception.getMessage().contains("Unsupported trie stream version"));
|
assertTrue(exception.getMessage().contains("Unsupported trie stream version"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that the latest stream version validates textual metadata blocks.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom rejects invalid textual metadata block")
|
||||||
|
void readFromRejectsInvalidTextualMetadataBlock() {
|
||||||
|
final int version = FrequencyTrie.currentFormatVersion();
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, version, 1, 0, dataOutput -> {
|
||||||
|
dataOutput.writeUTF("not valid metadata");
|
||||||
|
}, new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final IOException exception = assertThrows(IOException.class,
|
||||||
|
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||||
|
|
||||||
|
assertTrue(exception.getMessage().contains("Invalid metadata block"));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that deserialization rejects a negative node count.
|
* Verifies that deserialization rejects a negative node count.
|
||||||
*/
|
*/
|
||||||
@@ -862,6 +1011,129 @@ class FrequencyTrieTest {
|
|||||||
assertTrue(exception.getMessage().contains("Non-positive stored count"));
|
assertTrue(exception.getMessage().contains("Non-positive stored count"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that legacy version 1 metadata uses compatibility defaults.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom supports legacy version 1 metadata")
|
||||||
|
void readFromSupportsLegacyVersionOneMetadata() throws IOException {
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
|
||||||
|
|
||||||
|
assertEquals(TrieMetadata.legacy(1, WordTraversalDirection.BACKWARD), trie.metadata());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that legacy version 2 metadata stores traversal direction and uses
|
||||||
|
* compatibility defaults for other values.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom supports legacy version 2 metadata")
|
||||||
|
void readFromSupportsLegacyVersionTwoMetadata() throws IOException {
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 2, 1, 0,
|
||||||
|
dataOutput -> dataOutput.writeInt(WordTraversalDirection.FORWARD.ordinal()), new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
|
||||||
|
|
||||||
|
assertEquals(TrieMetadata.legacy(2, WordTraversalDirection.FORWARD), trie.metadata());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that version 3 metadata includes reduction and diacritic
|
||||||
|
* processing settings.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom parses version 3 metadata")
|
||||||
|
void readFromParsesVersionThreeMetadata() throws IOException {
|
||||||
|
final ReductionSettings reductionSettings = new ReductionSettings(
|
||||||
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS, 81, 4);
|
||||||
|
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 3, 1, 0,
|
||||||
|
dataOutput -> {
|
||||||
|
dataOutput.writeInt(WordTraversalDirection.BACKWARD.ordinal());
|
||||||
|
dataOutput.writeInt(reductionSettings.reductionMode().ordinal());
|
||||||
|
dataOutput.writeInt(reductionSettings.dominantWinnerMinPercent());
|
||||||
|
dataOutput.writeInt(reductionSettings.dominantWinnerOverSecondRatio());
|
||||||
|
dataOutput.writeInt(DiacriticProcessingMode.REMOVE.ordinal());
|
||||||
|
},
|
||||||
|
new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
|
||||||
|
final TrieMetadata metadata = trie.metadata();
|
||||||
|
|
||||||
|
assertAll(() -> assertEquals(3, metadata.formatVersion()),
|
||||||
|
() -> assertEquals(WordTraversalDirection.BACKWARD, metadata.traversalDirection()),
|
||||||
|
() -> assertEquals(reductionSettings, metadata.reductionSettings()),
|
||||||
|
() -> assertEquals(DiacriticProcessingMode.REMOVE, metadata.diacriticProcessingMode()),
|
||||||
|
() -> assertEquals(CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, metadata.caseProcessingMode()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that version 4 metadata additionally stores case-processing mode.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom parses version 4 case processing metadata")
|
||||||
|
void readFromParsesVersionFourCaseMetadata() throws IOException {
|
||||||
|
final ReductionSettings reductionSettings = new ReductionSettings(
|
||||||
|
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS, 75, 3);
|
||||||
|
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 4, 1, 0,
|
||||||
|
dataOutput -> {
|
||||||
|
dataOutput.writeInt(WordTraversalDirection.FORWARD.ordinal());
|
||||||
|
dataOutput.writeInt(reductionSettings.reductionMode().ordinal());
|
||||||
|
dataOutput.writeInt(reductionSettings.dominantWinnerMinPercent());
|
||||||
|
dataOutput.writeInt(reductionSettings.dominantWinnerOverSecondRatio());
|
||||||
|
dataOutput.writeInt(DiacriticProcessingMode.AS_IS.ordinal());
|
||||||
|
dataOutput.writeInt(CaseProcessingMode.AS_IS.ordinal());
|
||||||
|
},
|
||||||
|
new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC);
|
||||||
|
final TrieMetadata metadata = trie.metadata();
|
||||||
|
|
||||||
|
assertAll(() -> assertEquals(4, metadata.formatVersion()),
|
||||||
|
() -> assertEquals(WordTraversalDirection.FORWARD, metadata.traversalDirection()),
|
||||||
|
() -> assertEquals(reductionSettings, metadata.reductionSettings()),
|
||||||
|
() -> assertEquals(DiacriticProcessingMode.AS_IS, metadata.diacriticProcessingMode()),
|
||||||
|
() -> assertEquals(CaseProcessingMode.AS_IS, metadata.caseProcessingMode()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that invalid legacy metadata ordinals are rejected by validation.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@Tag("persistence")
|
||||||
|
@DisplayName("readFrom rejects invalid metadata ordinal in legacy stream")
|
||||||
|
void readFromRejectsInvalidLegacyMetadataOrdinal() {
|
||||||
|
final byte[] bytes = createSerializedStream(0x45475452, 2, 1, 0,
|
||||||
|
dataOutput -> dataOutput.writeInt(999), new NodeWriter[] { dataOutput -> {
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
dataOutput.writeInt(0);
|
||||||
|
} });
|
||||||
|
|
||||||
|
final IOException exception = assertThrows(IOException.class,
|
||||||
|
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||||
|
|
||||||
|
assertTrue(exception.getMessage().contains("Invalid traversal direction ordinal"));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Writes one node body into a synthetic serialized trie stream.
|
* Writes one node body into a synthetic serialized trie stream.
|
||||||
*/
|
*/
|
||||||
@@ -889,6 +1161,24 @@ class FrequencyTrieTest {
|
|||||||
*/
|
*/
|
||||||
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
|
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
|
||||||
final int rootNodeId, final NodeWriter[] nodes) {
|
final int rootNodeId, final NodeWriter[] nodes) {
|
||||||
|
return createSerializedStream(magic, version, nodeCount, rootNodeId, dataOutput -> {
|
||||||
|
// legacy and text-based versions write their metadata differently.
|
||||||
|
}, nodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes a synthetic serialized trie stream with a metadata writer hook.
|
||||||
|
*
|
||||||
|
* @param magic stream magic
|
||||||
|
* @param version stream version
|
||||||
|
* @param nodeCount declared node count
|
||||||
|
* @param rootNodeId declared root node identifier
|
||||||
|
* @param metadata version-specific metadata writer
|
||||||
|
* @param nodes node body writers
|
||||||
|
* @return serialized bytes
|
||||||
|
*/
|
||||||
|
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
|
||||||
|
final int rootNodeId, final MetadataWriter metadata, final NodeWriter[] nodes) {
|
||||||
try {
|
try {
|
||||||
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||||
final DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream);
|
final DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream);
|
||||||
@@ -897,6 +1187,7 @@ class FrequencyTrieTest {
|
|||||||
dataOutputStream.writeInt(version);
|
dataOutputStream.writeInt(version);
|
||||||
dataOutputStream.writeInt(nodeCount);
|
dataOutputStream.writeInt(nodeCount);
|
||||||
dataOutputStream.writeInt(rootNodeId);
|
dataOutputStream.writeInt(rootNodeId);
|
||||||
|
metadata.write(dataOutputStream);
|
||||||
|
|
||||||
for (NodeWriter node : nodes) {
|
for (NodeWriter node : nodes) {
|
||||||
node.write(dataOutputStream);
|
node.write(dataOutputStream);
|
||||||
@@ -908,4 +1199,19 @@ class FrequencyTrieTest {
|
|||||||
throw new IllegalStateException("Unexpected I/O while building synthetic trie stream.", exception);
|
throw new IllegalStateException("Unexpected I/O while building synthetic trie stream.", exception);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes one synthetic metadata block.
|
||||||
|
*/
|
||||||
|
@FunctionalInterface
|
||||||
|
private interface MetadataWriter {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes metadata bytes for one stream version.
|
||||||
|
*
|
||||||
|
* @param dataOutput output stream
|
||||||
|
* @throws IOException if writing fails
|
||||||
|
*/
|
||||||
|
void write(DataOutputStream dataOutput) throws IOException;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -65,10 +65,9 @@ import org.junit.jupiter.api.io.TempDir;
|
|||||||
* stems declared by the source dictionary.
|
* stems declared by the source dictionary.
|
||||||
*/
|
*/
|
||||||
@DisplayName("Deterministic fuzz-style trie and stemmer compilation")
|
@DisplayName("Deterministic fuzz-style trie and stemmer compilation")
|
||||||
@Tag("unit")
|
|
||||||
@Tag("fuzz")
|
@Tag("fuzz")
|
||||||
@Tag("trie")
|
@Tag("trie")
|
||||||
@Tag("stemming")
|
@Tag("stemmer")
|
||||||
class FuzzStemmerAndTrieCompilationTest {
|
class FuzzStemmerAndTrieCompilationTest {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ import static org.junit.jupiter.api.Assertions.assertNotNull;
|
|||||||
import net.jqwik.api.ForAll;
|
import net.jqwik.api.ForAll;
|
||||||
import net.jqwik.api.Label;
|
import net.jqwik.api.Label;
|
||||||
import net.jqwik.api.Property;
|
import net.jqwik.api.Property;
|
||||||
import net.jqwik.api.Tag;
|
import org.junit.jupiter.api.Tag;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Property-based tests for {@link PatchCommandEncoder}.
|
* Property-based tests for {@link PatchCommandEncoder}.
|
||||||
@@ -47,9 +47,9 @@ import net.jqwik.api.Tag;
|
|||||||
* reconstruct the exact requested target.
|
* reconstruct the exact requested target.
|
||||||
*/
|
*/
|
||||||
@Label("PatchCommandEncoder properties")
|
@Label("PatchCommandEncoder properties")
|
||||||
@Tag("unit")
|
|
||||||
@Tag("property")
|
@Tag("property")
|
||||||
@Tag("patch")
|
@Tag("patch")
|
||||||
|
@Tag("stemmer")
|
||||||
class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -241,7 +241,7 @@ class PatchCommandEncoderTest {
|
|||||||
*/
|
*/
|
||||||
@Nested
|
@Nested
|
||||||
@DisplayName("construction")
|
@DisplayName("construction")
|
||||||
@Tag("constructor")
|
@Tag("construction")
|
||||||
class ConstructionTests {
|
class ConstructionTests {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -326,7 +326,7 @@ class PatchCommandEncoderTest {
|
|||||||
*/
|
*/
|
||||||
@Nested
|
@Nested
|
||||||
@DisplayName("encode(String, String)")
|
@DisplayName("encode(String, String)")
|
||||||
@Tag("encode")
|
@Tag("encoding")
|
||||||
class EncodeTests {
|
class EncodeTests {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -658,7 +658,7 @@ class PatchCommandEncoderTest {
|
|||||||
*/
|
*/
|
||||||
@Nested
|
@Nested
|
||||||
@DisplayName("reversed-word processing")
|
@DisplayName("reversed-word processing")
|
||||||
@Tag("reverse")
|
@Tag("normalization")
|
||||||
class ReversedWordProcessingTests {
|
class ReversedWordProcessingTests {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -75,6 +75,7 @@ import org.junit.jupiter.api.io.TempDir;
|
|||||||
@DisplayName("StemmerDictionaryParser")
|
@DisplayName("StemmerDictionaryParser")
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("parser")
|
@Tag("parser")
|
||||||
|
@Tag("stemmer")
|
||||||
class StemmerDictionaryParserTest {
|
class StemmerDictionaryParserTest {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -54,9 +54,9 @@ import org.junit.jupiter.api.io.TempDir;
|
|||||||
/**
|
/**
|
||||||
* Tests for {@link StemmerKnowledgeExperiment}.
|
* Tests for {@link StemmerKnowledgeExperiment}.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
|
||||||
@Tag("integration")
|
@Tag("integration")
|
||||||
@Tag("stemmer")
|
@Tag("stemmer")
|
||||||
|
@Tag("trie")
|
||||||
final class StemmerKnowledgeExperimentTest {
|
final class StemmerKnowledgeExperimentTest {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -38,6 +38,8 @@ import static org.junit.jupiter.api.Assertions.assertSame;
|
|||||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
import static org.mockito.ArgumentMatchers.any;
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
|
import static org.mockito.ArgumentMatchers.anyInt;
|
||||||
|
import static org.mockito.ArgumentMatchers.eq;
|
||||||
import static org.mockito.Mockito.mock;
|
import static org.mockito.Mockito.mock;
|
||||||
import static org.mockito.Mockito.mockStatic;
|
import static org.mockito.Mockito.mockStatic;
|
||||||
import static org.mockito.Mockito.verify;
|
import static org.mockito.Mockito.verify;
|
||||||
@@ -91,6 +93,8 @@ import org.mockito.MockedStatic;
|
|||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("io")
|
@Tag("io")
|
||||||
@Tag("persistence")
|
@Tag("persistence")
|
||||||
|
@Tag("serialization")
|
||||||
|
@Tag("trie")
|
||||||
@DisplayName("StemmerPatchTrieBinaryIO")
|
@DisplayName("StemmerPatchTrieBinaryIO")
|
||||||
class StemmerPatchTrieBinaryIOTest {
|
class StemmerPatchTrieBinaryIOTest {
|
||||||
|
|
||||||
@@ -299,9 +303,19 @@ class StemmerPatchTrieBinaryIOTest {
|
|||||||
"read(Path) must reject null path."),
|
"read(Path) must reject null path."),
|
||||||
() -> assertThrows(NullPointerException.class, () -> StemmerPatchTrieBinaryIO.read((String) null),
|
() -> assertThrows(NullPointerException.class, () -> StemmerPatchTrieBinaryIO.read((String) null),
|
||||||
"read(String) must reject null file name."),
|
"read(String) must reject null file name."),
|
||||||
|
() -> assertThrows(NullPointerException.class,
|
||||||
|
() -> StemmerPatchTrieBinaryIO.read((Path) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||||
|
"read(Path, int) must reject null path."),
|
||||||
|
() -> assertThrows(NullPointerException.class,
|
||||||
|
() -> StemmerPatchTrieBinaryIO.read((String) null,
|
||||||
|
FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||||
|
"read(String, int) must reject null file name."),
|
||||||
() -> assertThrows(NullPointerException.class,
|
() -> assertThrows(NullPointerException.class,
|
||||||
() -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null),
|
() -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null),
|
||||||
"read(InputStream) must reject null input stream."));
|
"read(InputStream) must reject null input stream."),
|
||||||
|
() -> assertThrows(NullPointerException.class,
|
||||||
|
() -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||||
|
"read(InputStream, int) must reject null input stream."));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -385,6 +399,143 @@ class StemmerPatchTrieBinaryIOTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that stream overload with dense span override delegates to the
|
||||||
|
* four-argument readFrom method.
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should delegate stream read with dense span override")
|
||||||
|
void shouldDelegateInputStreamReadWithDenseSpanOverride() throws IOException {
|
||||||
|
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||||
|
final byte[] gzipPayload = gzip("binary-content-with-max-expanded-index");
|
||||||
|
|
||||||
|
try (@SuppressWarnings("rawtypes")
|
||||||
|
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||||
|
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||||
|
any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(expectedTrie);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO
|
||||||
|
.read(new ByteArrayInputStream(gzipPayload), 17);
|
||||||
|
|
||||||
|
assertSame(expectedTrie, actualTrie,
|
||||||
|
"read(InputStream, int) must return the trie produced by FrequencyTrie.readFrom(...).");
|
||||||
|
|
||||||
|
mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||||
|
any(FrequencyTrie.ValueStreamCodec.class), eq(17)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that path overload with dense span override delegates to the
|
||||||
|
* same method overload with the override parameter.
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should delegate path read with dense span override")
|
||||||
|
void shouldDelegatePathReadWithDenseSpanOverride() throws IOException {
|
||||||
|
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||||
|
final Path sourceFile = temporaryDirectory.resolve("input-max-expanded.bin.gz");
|
||||||
|
Files.write(sourceFile, gzip("path-based-max-expanded-index"));
|
||||||
|
|
||||||
|
try (@SuppressWarnings("rawtypes")
|
||||||
|
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||||
|
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||||
|
any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(expectedTrie);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO.read(sourceFile, 0);
|
||||||
|
|
||||||
|
assertSame(expectedTrie, actualTrie,
|
||||||
|
"read(Path, int) must return the trie produced by FrequencyTrie.readFrom(...).");
|
||||||
|
|
||||||
|
mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||||
|
any(FrequencyTrie.ValueStreamCodec.class), eq(0)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that string path overload with dense span override delegates to the
|
||||||
|
* same method overload with the override parameter.
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should delegate file name read with dense span override")
|
||||||
|
void shouldDelegateStringReadWithDenseSpanOverride() throws IOException {
|
||||||
|
final FrequencyTrie<String> expectedTrie = mock(FrequencyTrie.class);
|
||||||
|
final Path sourceFile = temporaryDirectory.resolve("input-string-max-expanded.bin.gz");
|
||||||
|
Files.write(sourceFile, gzip("string-based-max-expanded-index"));
|
||||||
|
|
||||||
|
try (@SuppressWarnings("rawtypes")
|
||||||
|
MockedStatic<FrequencyTrie> mockedStatic = mockStatic(FrequencyTrie.class)) {
|
||||||
|
mockedStatic.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||||
|
any(FrequencyTrie.ValueStreamCodec.class), anyInt())).thenReturn(expectedTrie);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> actualTrie = StemmerPatchTrieBinaryIO.read(sourceFile.toString(), 32);
|
||||||
|
|
||||||
|
assertSame(expectedTrie, actualTrie,
|
||||||
|
"read(String, int) must return the trie produced by FrequencyTrie.readFrom(...).");
|
||||||
|
|
||||||
|
mockedStatic.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
|
||||||
|
any(FrequencyTrie.ValueStreamCodec.class), eq(32)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that metadata-only read parses and returns the persisted metadata.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should read metadata from gzip payload")
|
||||||
|
void shouldReadMetadataFromGzipPayload() throws IOException {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||||
|
builder.put("run", PatchCommandEncoder.builder().build().encode("running", "run"));
|
||||||
|
final FrequencyTrie<String> trie = builder.build();
|
||||||
|
|
||||||
|
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||||
|
StemmerPatchTrieBinaryIO.write(trie, outputStream);
|
||||||
|
|
||||||
|
final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(new ByteArrayInputStream(outputStream.toByteArray()));
|
||||||
|
|
||||||
|
assertEquals(trie.metadata(), metadata,
|
||||||
|
"readMetadata(InputStream) must return the same metadata persisted by write().");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that metadata can be read from a binary file path.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should read metadata from file path")
|
||||||
|
void shouldReadMetadataFromPath() throws IOException {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||||
|
builder.put("city", PatchCommandEncoder.builder().build().encode("cities", "city"));
|
||||||
|
final FrequencyTrie<String> trie = builder.build();
|
||||||
|
|
||||||
|
final Path sourceFile = temporaryDirectory.resolve("metadata-path.bin.gz");
|
||||||
|
StemmerPatchTrieBinaryIO.write(trie, sourceFile);
|
||||||
|
|
||||||
|
final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(sourceFile);
|
||||||
|
assertEquals(trie.metadata(), metadata);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that metadata can be read from a binary file name.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Should read metadata from file name")
|
||||||
|
void shouldReadMetadataFromStringPath() throws IOException {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS));
|
||||||
|
builder.put("city", PatchCommandEncoder.builder().build().encode("cities", "city"));
|
||||||
|
final FrequencyTrie<String> trie = builder.build();
|
||||||
|
|
||||||
|
final Path sourceFile = temporaryDirectory.resolve("metadata-string.bin.gz");
|
||||||
|
StemmerPatchTrieBinaryIO.write(trie, sourceFile);
|
||||||
|
|
||||||
|
final TrieMetadata metadata = StemmerPatchTrieBinaryIO.readMetadata(sourceFile.toString());
|
||||||
|
assertEquals(trie.metadata(), metadata);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that malformed non-GZip input is reported as an I/O failure.
|
* Verifies that malformed non-GZip input is reported as an I/O failure.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -85,9 +85,10 @@ import org.junit.jupiter.params.provider.MethodSource;
|
|||||||
* <li>the current bundled language set, including right-to-left metadata</li>
|
* <li>the current bundled language set, including right-to-left metadata</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
|
||||||
@Tag("integration")
|
@Tag("integration")
|
||||||
@Tag("stemmer")
|
@Tag("stemmer")
|
||||||
|
@Tag("io")
|
||||||
|
@Tag("parser")
|
||||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||||
final class StemmerPatchTrieLoaderTest {
|
final class StemmerPatchTrieLoaderTest {
|
||||||
|
|
||||||
@@ -210,36 +211,43 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
Arguments.of("14-load-binary-string",
|
Arguments.of("14-load-binary-string",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null),
|
||||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||||
Arguments.of("15-load-binary-stream",
|
Arguments.of("15-load-binary-path-override",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((Path) null, FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||||
|
"path"),
|
||||||
|
Arguments.of("16-load-binary-string-override",
|
||||||
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((String) null,
|
||||||
|
FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX),
|
||||||
|
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||||
|
Arguments.of("17-load-binary-stream",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinary((InputStream) null),
|
||||||
"inputStream"),
|
"inputStream"),
|
||||||
Arguments.of("16-save-binary-null-trie-path",
|
Arguments.of("18-save-binary-null-trie-path",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath()), "trie"),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath()), "trie"),
|
||||||
Arguments.of("17-save-binary-null-path",
|
Arguments.of("19-save-binary-null-path",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (Path) null), "path"),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (Path) null), "path"),
|
||||||
Arguments.of("18-save-binary-null-trie-string",
|
Arguments.of("20-save-binary-null-trie-string",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath().toString()),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(null, tempPath().toString()),
|
||||||
"trie"),
|
"trie"),
|
||||||
Arguments.of("19-save-binary-null-string",
|
Arguments.of("21-save-binary-null-string",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
|
||||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||||
Arguments.of("20-load-language-null-metadata",
|
Arguments.of("22-load-language-null-metadata",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK,
|
||||||
true, (TrieMetadata) null),
|
true, (TrieMetadata) null),
|
||||||
"metadata"),
|
"metadata"),
|
||||||
Arguments.of("21-load-path-null-metadata",
|
Arguments.of("23-load-path-null-metadata",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (TrieMetadata) null),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath(), true, (TrieMetadata) null),
|
||||||
"metadata"),
|
"metadata"),
|
||||||
Arguments.of("22-load-string-null-metadata",
|
Arguments.of("24-load-string-null-metadata",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.load(tempPath().toString(), true,
|
||||||
(TrieMetadata) null),
|
(TrieMetadata) null),
|
||||||
"metadata"),
|
"metadata"),
|
||||||
Arguments.of("23-load-binary-metadata-path-null",
|
Arguments.of("25-load-binary-metadata-path-null",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((Path) null), "path"),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((Path) null), "path"),
|
||||||
Arguments.of("24-load-binary-metadata-string-null",
|
Arguments.of("26-load-binary-metadata-string-null",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((String) null),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((String) null),
|
||||||
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
StemmerPatchTrieLoader.FILENAME_REQUIRED),
|
||||||
Arguments.of("25-load-binary-metadata-stream-null",
|
Arguments.of("27-load-binary-metadata-stream-null",
|
||||||
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((InputStream) null),
|
(ExecutableOperation) () -> StemmerPatchTrieLoader.loadBinaryMetadata((InputStream) null),
|
||||||
"inputStream"));
|
"inputStream"));
|
||||||
}
|
}
|
||||||
@@ -512,6 +520,44 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that binary load overloads with an explicit dense lookup span
|
||||||
|
* preserve trie semantics while honoring the dense-layout override.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Binary dense-span override overloads should load equivalent tries")
|
||||||
|
void shouldLoadBinaryWithDenseSpanOverrideOverloads() throws IOException {
|
||||||
|
final Path dictionaryFile = writeDictionary("""
|
||||||
|
run running runs runner
|
||||||
|
city cities
|
||||||
|
study studies studying
|
||||||
|
""");
|
||||||
|
final Path binaryFile = tempDir.resolve("stemmer-trie-overrides.bin.gz");
|
||||||
|
|
||||||
|
final FrequencyTrie<String> original = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
||||||
|
DEFAULT_REDUCTION_MODE);
|
||||||
|
|
||||||
|
StemmerPatchTrieLoader.saveBinary(original, binaryFile);
|
||||||
|
|
||||||
|
final FrequencyTrie<String> fromPathDefault = StemmerPatchTrieLoader.loadBinary(binaryFile);
|
||||||
|
final FrequencyTrie<String> fromPathDefaultByNegative = StemmerPatchTrieLoader.loadBinary(binaryFile,
|
||||||
|
FrequencyTrie.DEFAULT_MAX_EXPANDED_INDEX);
|
||||||
|
final FrequencyTrie<String> fromPathNoDense = StemmerPatchTrieLoader.loadBinary(binaryFile, 0);
|
||||||
|
final FrequencyTrie<String> fromStringNoDense = StemmerPatchTrieLoader.loadBinary(binaryFile.toString(), 0);
|
||||||
|
|
||||||
|
assertTriePatchSemanticsEqual(original, fromPathDefault, "run", "running", "runner", "cities", "studying");
|
||||||
|
assertTriePatchSemanticsEqual(original, fromPathDefaultByNegative, "run", "running", "runner", "cities",
|
||||||
|
"studying");
|
||||||
|
assertTriePatchSemanticsEqual(original, fromPathNoDense, "run", "running", "runner", "cities", "studying");
|
||||||
|
assertTriePatchSemanticsEqual(original, fromStringNoDense, "run", "running", "runner", "cities",
|
||||||
|
"studying");
|
||||||
|
|
||||||
|
assertFalse(fromPathNoDense.root().hasDenseLookup(),
|
||||||
|
"Zero span should disable dense lookup on the loaded root.");
|
||||||
|
assertFalse(fromStringNoDense.root().hasDenseLookup(),
|
||||||
|
"Zero span should disable dense lookup on the loaded root.");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Writes a dictionary file into the temporary directory.
|
* Writes a dictionary file into the temporary directory.
|
||||||
*
|
*
|
||||||
@@ -530,6 +576,7 @@ final class StemmerPatchTrieLoaderTest {
|
|||||||
* Bundled dictionary integration tests.
|
* Bundled dictionary integration tests.
|
||||||
*/
|
*/
|
||||||
@Nested
|
@Nested
|
||||||
|
@Tag("slow")
|
||||||
@DisplayName("Bundled dictionaries")
|
@DisplayName("Bundled dictionaries")
|
||||||
final class BundledDictionaryTests {
|
final class BundledDictionaryTests {
|
||||||
|
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ import java.util.Set;
|
|||||||
import net.jqwik.api.ForAll;
|
import net.jqwik.api.ForAll;
|
||||||
import net.jqwik.api.Label;
|
import net.jqwik.api.Label;
|
||||||
import net.jqwik.api.Property;
|
import net.jqwik.api.Property;
|
||||||
import net.jqwik.api.Tag;
|
import org.junit.jupiter.api.Tag;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Property-based tests for patch-command stemmer tries.
|
* Property-based tests for patch-command stemmer tries.
|
||||||
@@ -56,9 +56,8 @@ import net.jqwik.api.Tag;
|
|||||||
* persistence must not alter that behavior.
|
* persistence must not alter that behavior.
|
||||||
*/
|
*/
|
||||||
@Label("Stemmer patch trie properties")
|
@Label("Stemmer patch trie properties")
|
||||||
@Tag("unit")
|
|
||||||
@Tag("property")
|
@Tag("property")
|
||||||
@Tag("stemming")
|
@Tag("stemmer")
|
||||||
class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
|
class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -40,6 +40,8 @@ import org.junit.jupiter.api.Tag;
|
|||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
|
@Tag("metadata")
|
||||||
|
@Tag("trie")
|
||||||
@DisplayName("TrieMetadata")
|
@DisplayName("TrieMetadata")
|
||||||
class TrieMetadataTest {
|
class TrieMetadataTest {
|
||||||
|
|
||||||
|
|||||||
@@ -40,6 +40,8 @@ import org.junit.jupiter.api.Tag;
|
|||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
|
@Tag("core")
|
||||||
|
@Tag("stemmer")
|
||||||
@DisplayName("WordTraversalDirection")
|
@DisplayName("WordTraversalDirection")
|
||||||
class WordTraversalDirectionTest {
|
class WordTraversalDirectionTest {
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
* Unit tests for {@link ChildDescriptor}.
|
* Unit tests for {@link ChildDescriptor}.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("fast")
|
@Tag("trie")
|
||||||
@DisplayName("ChildDescriptor")
|
@DisplayName("ChildDescriptor")
|
||||||
class ChildDescriptorTest {
|
class ChildDescriptorTest {
|
||||||
|
|
||||||
|
|||||||
@@ -31,8 +31,10 @@
|
|||||||
package org.egothor.stemmer.trie;
|
package org.egothor.stemmer.trie;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
import org.junit.jupiter.api.DisplayName;
|
import org.junit.jupiter.api.DisplayName;
|
||||||
import org.junit.jupiter.api.Tag;
|
import org.junit.jupiter.api.Tag;
|
||||||
@@ -43,7 +45,6 @@ import org.junit.jupiter.api.Test;
|
|||||||
* documented backing-array exposure.
|
* documented backing-array exposure.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("fast")
|
|
||||||
@Tag("trie")
|
@Tag("trie")
|
||||||
@DisplayName("CompiledNode and NodeData")
|
@DisplayName("CompiledNode and NodeData")
|
||||||
class CompiledNodeAndNodeDataTest {
|
class CompiledNodeAndNodeDataTest {
|
||||||
@@ -141,4 +142,136 @@ class CompiledNodeAndNodeDataTest {
|
|||||||
assertSame(orderedValues, node.orderedValues());
|
assertSame(orderedValues, node.orderedValues());
|
||||||
assertSame(orderedCounts, node.orderedCounts());
|
assertSame(orderedCounts, node.orderedCounts());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that dense lookup is used when the interval is compact.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("CompiledNode can resolve child via dense lookup table")
|
||||||
|
void compiledNodeUsesDenseLookupForCompactIntervals() {
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final CompiledNode<String>[] children = new CompiledNode[4];
|
||||||
|
children[0] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
children[1] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
children[2] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
children[3] = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
|
||||||
|
final CompiledNode<String> node = new CompiledNode<>(new char[] { 'a', 'b', 'c', 'd' }, children,
|
||||||
|
new String[] { "1", "2", "3", "4" }, new int[] { 1, 1, 1, 1 });
|
||||||
|
|
||||||
|
assertTrue(node.hasDenseLookup());
|
||||||
|
|
||||||
|
assertSame(children[0], node.findChild('a'));
|
||||||
|
assertSame(children[3], node.findChild('d'));
|
||||||
|
assertSame(null, node.findChild('z'));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that fallback linear scan is used for small node degree.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("CompiledNode resolves child by linear scan for small degree")
|
||||||
|
void compiledNodeUsesLinearScanForSmallDegree() {
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final CompiledNode<String>[] children = new CompiledNode[4];
|
||||||
|
final CompiledNode<String> childA = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
final CompiledNode<String> childB = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
final CompiledNode<String> childC = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
final CompiledNode<String> childD = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
children[0] = childA;
|
||||||
|
children[1] = childB;
|
||||||
|
children[2] = childC;
|
||||||
|
children[3] = childD;
|
||||||
|
|
||||||
|
final CompiledNode<String> node = new CompiledNode<>(new char[] { 'a', 'z', '中', '你' }, children,
|
||||||
|
new String[] { "1", "2", "3", "4" }, 0, new int[] { 1, 1, 1, 1 });
|
||||||
|
|
||||||
|
assertFalse(node.hasDenseLookup());
|
||||||
|
|
||||||
|
assertSame(childA, node.findChild('a'));
|
||||||
|
assertSame(childD, node.findChild('你'));
|
||||||
|
assertSame(null, node.findChild('b'));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that fallback binary search is used for larger node degree without
|
||||||
|
* dense lookup.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("CompiledNode resolves child by binary search for large degree")
|
||||||
|
void compiledNodeUsesBinarySearchForLargeDegree() {
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final CompiledNode<String>[] children = new CompiledNode[5];
|
||||||
|
final CompiledNode<String> childA = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
final CompiledNode<String> childB = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
final CompiledNode<String> childC = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
final CompiledNode<String> childD = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
final CompiledNode<String> childE = new CompiledNode<>(new char[0], new CompiledNode[0], new String[0], new int[0]);
|
||||||
|
children[0] = childA;
|
||||||
|
children[1] = childB;
|
||||||
|
children[2] = childC;
|
||||||
|
children[3] = childD;
|
||||||
|
children[4] = childE;
|
||||||
|
|
||||||
|
final CompiledNode<String> node = new CompiledNode<>(new char[] { 'a', 'c', 'k', 't', 'z' }, children,
|
||||||
|
new String[] { "1", "2", "3", "4", "5" }, 0, new int[] { 1, 1, 1, 1, 1 });
|
||||||
|
|
||||||
|
assertFalse(node.hasDenseLookup());
|
||||||
|
|
||||||
|
assertSame(childC, node.findChild('k'));
|
||||||
|
assertSame(childE, node.findChild('z'));
|
||||||
|
assertSame(null, node.findChild('x'));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies the basic node-state helpers that are used by diagnostics and
|
||||||
|
* behavioral checks.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("CompiledNode reports leaf, value and edge presence state")
|
||||||
|
void compiledNodeReportsNodeStateHelpers() {
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final CompiledNode<String>[] childless = new CompiledNode[0];
|
||||||
|
final CompiledNode<String> leaf = new CompiledNode<>(new char[0], childless, new String[0], new int[0]);
|
||||||
|
|
||||||
|
assertTrue(leaf.isLeaf());
|
||||||
|
assertFalse(leaf.hasChildren());
|
||||||
|
assertFalse(leaf.hasValues());
|
||||||
|
assertFalse(leaf.hasEdge('a'));
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final CompiledNode<String>[] child = new CompiledNode[1];
|
||||||
|
final String[] orderedValues = new String[] { "leaf" };
|
||||||
|
final int[] orderedCounts = new int[] { 1 };
|
||||||
|
child[0] = new CompiledNode<>(new char[0], new CompiledNode[0], orderedValues, orderedCounts);
|
||||||
|
final CompiledNode<String> node = new CompiledNode<>(new char[] { 'a' }, child, orderedValues, orderedCounts);
|
||||||
|
|
||||||
|
assertFalse(node.isLeaf());
|
||||||
|
assertTrue(node.hasChildren());
|
||||||
|
assertTrue(node.hasValues());
|
||||||
|
assertTrue(node.valueCount() > 0);
|
||||||
|
assertTrue(node.hasEdge('a'));
|
||||||
|
assertFalse(node.hasEdge('b'));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies structural equality and hash-code behavior for compiled nodes.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("CompiledNode equals and hashCode align for identical structure")
|
||||||
|
void compiledNodeEqualsAndHashCodeAlignForIdenticalStructure() {
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
final CompiledNode<String>[] child = new CompiledNode[1];
|
||||||
|
final CompiledNode<String> leaf = new CompiledNode<>(new char[0], new CompiledNode[0], new String[] { "v" },
|
||||||
|
new int[] { 1 });
|
||||||
|
child[0] = leaf;
|
||||||
|
|
||||||
|
final CompiledNode<String> first = new CompiledNode<>(new char[] { 'a' }, child, new String[] { "x" },
|
||||||
|
new int[] { 2 });
|
||||||
|
final CompiledNode<String> second = new CompiledNode<>(new char[] { 'a' }, child, new String[] { "x" },
|
||||||
|
new int[] { 2 });
|
||||||
|
|
||||||
|
assertEquals(first, second);
|
||||||
|
assertEquals(first.hashCode(), second.hashCode());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
* Unit tests for {@link DominantLocalDescriptor}.
|
* Unit tests for {@link DominantLocalDescriptor}.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("fast")
|
@Tag("trie")
|
||||||
@DisplayName("DominantLocalDescriptor")
|
@DisplayName("DominantLocalDescriptor")
|
||||||
class DominantLocalDescriptorTest {
|
class DominantLocalDescriptorTest {
|
||||||
|
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
* Unit tests for {@link LocalValueSummary}.
|
* Unit tests for {@link LocalValueSummary}.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("fast")
|
@Tag("trie")
|
||||||
@DisplayName("LocalValueSummary")
|
@DisplayName("LocalValueSummary")
|
||||||
class LocalValueSummaryTest {
|
class LocalValueSummaryTest {
|
||||||
|
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
* Unit tests for {@link MutableNode}.
|
* Unit tests for {@link MutableNode}.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("fast")
|
@Tag("trie")
|
||||||
@DisplayName("MutableNode")
|
@DisplayName("MutableNode")
|
||||||
class MutableNodeTest {
|
class MutableNodeTest {
|
||||||
|
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
* Unit tests for {@link RankedLocalDescriptor}.
|
* Unit tests for {@link RankedLocalDescriptor}.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("fast")
|
@Tag("trie")
|
||||||
@DisplayName("RankedLocalDescriptor")
|
@DisplayName("RankedLocalDescriptor")
|
||||||
class RankedLocalDescriptorTest {
|
class RankedLocalDescriptorTest {
|
||||||
|
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
* Unit tests for {@link ReducedNode}.
|
* Unit tests for {@link ReducedNode}.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("fast")
|
@Tag("trie")
|
||||||
@DisplayName("ReducedNode")
|
@DisplayName("ReducedNode")
|
||||||
class ReducedNodeTest {
|
class ReducedNodeTest {
|
||||||
|
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
* Unit tests for {@link ReductionContext}.
|
* Unit tests for {@link ReductionContext}.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("fast")
|
@Tag("trie")
|
||||||
@DisplayName("ReductionContext")
|
@DisplayName("ReductionContext")
|
||||||
class ReductionContextTest {
|
class ReductionContextTest {
|
||||||
|
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
* Unit tests for {@link ReductionSignature}.
|
* Unit tests for {@link ReductionSignature}.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("fast")
|
@Tag("trie")
|
||||||
@DisplayName("ReductionSignature")
|
@DisplayName("ReductionSignature")
|
||||||
class ReductionSignatureTest {
|
class ReductionSignatureTest {
|
||||||
|
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
* Unit tests for {@link UnorderedLocalDescriptor}.
|
* Unit tests for {@link UnorderedLocalDescriptor}.
|
||||||
*/
|
*/
|
||||||
@Tag("unit")
|
@Tag("unit")
|
||||||
@Tag("fast")
|
@Tag("trie")
|
||||||
@DisplayName("UnorderedLocalDescriptor")
|
@DisplayName("UnorderedLocalDescriptor")
|
||||||
class UnorderedLocalDescriptorTest {
|
class UnorderedLocalDescriptorTest {
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user