6 Commits

Author SHA1 Message Date
7e1aea72bf refactor: apply minor Radixor refinements and refresh dependency locks 2026-04-16 21:31:01 +02:00
594abe2c4b feat: add jqwik property-based coverage for trie and patch invariants
test: add property-based tests for FrequencyTrie determinism across repeated compilation
test: verify semantic alignment of get(), getAll(), and getEntries()
test: verify binary serialization and compressed persistence round-trip stability
test: verify builder reconstruction preserves observable trie behavior
test: add property-based tests for PatchCommandEncoder encode/apply round-trip and determinism
test: add generated stemmer-trie properties ensuring returned patches reconstruct only acceptable stems
test: introduce bounded reusable jqwik generators and scenario builders for maintainable property coverage
build: add jqwik to test dependencies and integrate it with the existing JUnit Platform setup
test: replace Jupiter display and tag annotations in jqwik suites with jqwik-native metadata to remove discovery warnings
2026-04-16 19:40:29 +02:00
953ce2226a feat(test): add deterministic fuzz-style coverage for trie compilation and stemming
* add fixed-seed fuzz scenario generator for bounded trie and dictionary inputs
* validate compilation stability across repeated builds and binary round-trips
* validate generated stemming dictionaries for non-crashing compilation and acceptable stem reconstruction
* add CI-safe semantic invariants for reduced trie reconstruction using get() and getAll()
* avoid unstable count-preservation assertions for builder reconstruction from reduced shared tries
2026-04-16 18:51:39 +02:00
05692726c5 feat: publish Pages-backed quality badges in README
* add README badges for CI status, coverage, reports, mutation score, benchmark speedup, Maven Central, license, and Java baseline
* generate Shields endpoint metadata for JaCoCo, PIT, and JMH results
* move badge generation logic into tools/generate-pages-badges.py to keep workflows concise and maintainable
* update Pages publishing workflow to publish badge metadata for both build-specific and latest report views
* expose published badge metadata links in the reports index for transparency and troubleshooting
2026-04-16 18:22:24 +02:00
c18563617d feat: add release changelog generation and package distribution integration
feat: add custom release changelog generator based on release tag ranges and prefixed commit lines
build: include generated CHANGELOG.md in the distribution ZIP when present
ci: generate release changelog during release workflow and use it as the GitHub release body
ci: split release packaging so distZip is rebuilt after changelog generation
chore: keep changelog generation out of quality-gate and report publishing workflows
2026-04-16 17:42:22 +02:00
436deefd14 fix: exclude Maven metadata files from Central upload bundle
fix: remove maven-metadata files from the generated Central bundle
fix: align uploaded archive with Sonatype Portal component layout expectations
2026-04-16 03:42:59 +02:00
25 changed files with 2402 additions and 21 deletions

View File

@@ -3,20 +3,20 @@
<classpathentry kind="src" output="bin/main" path="src/main/java"> <classpathentry kind="src" output="bin/main" path="src/main/java">
<attributes> <attributes>
<attribute name="gradle_scope" value="main"/> <attribute name="gradle_scope" value="main"/>
<attribute name="gradle_used_by_scope" value="main,test,jmh"/> <attribute name="gradle_used_by_scope" value="main,test"/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry kind="src" output="bin/test" path="src/test/java"> <classpathentry kind="src" output="bin/test" path="src/test/java">
<attributes> <attributes>
<attribute name="gradle_scope" value="test"/> <attribute name="gradle_scope" value="test"/>
<attribute name="gradle_used_by_scope" value="test,jmh"/> <attribute name="gradle_used_by_scope" value="test"/>
<attribute name="test" value="true"/> <attribute name="test" value="true"/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry kind="src" output="bin/main" path="src/main/resources"> <classpathentry kind="src" output="bin/main" path="src/main/resources">
<attributes> <attributes>
<attribute name="gradle_scope" value="main"/> <attribute name="gradle_scope" value="main"/>
<attribute name="gradle_used_by_scope" value="main,test,jmh"/> <attribute name="gradle_used_by_scope" value="main,test"/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry kind="src" output="bin/jmh" path="src/jmh/java"> <classpathentry kind="src" output="bin/jmh" path="src/jmh/java">
@@ -36,7 +36,7 @@
<classpathentry kind="src" output="bin/test" path="src/test/resources"> <classpathentry kind="src" output="bin/test" path="src/test/resources">
<attributes> <attributes>
<attribute name="gradle_scope" value="test"/> <attribute name="gradle_scope" value="test"/>
<attribute name="gradle_used_by_scope" value="test,jmh"/> <attribute name="gradle_used_by_scope" value="test"/>
<attribute name="test" value="true"/> <attribute name="test" value="true"/>
</attributes> </attributes>
</classpathentry> </classpathentry>

View File

@@ -156,11 +156,31 @@ jobs:
test -f gradle.properties test -f gradle.properties
test -f gradle/verification-metadata.xml test -f gradle/verification-metadata.xml
- name: Build release distribution, signed Maven bundle, and SBOM - name: Generate release changelog for tagged builds
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/release@')
shell: bash
run: |
set -euo pipefail
chmod +x ./tools/generate-release-notes.sh
mkdir -p build/generated/release-notes
./tools/generate-release-notes.sh "${GITHUB_REF_NAME}" > build/generated/release-notes/CHANGELOG.md
- name: Build release inputs, signed Maven bundle, and SBOM
env: env:
SIGNING_KEY: ${{ secrets.SIGNING_KEY }} SIGNING_KEY: ${{ secrets.SIGNING_KEY }}
SIGNING_PASSWORD: ${{ secrets.SIGNING_PASSWORD }} SIGNING_PASSWORD: ${{ secrets.SIGNING_PASSWORD }}
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport distZip cyclonedxBom centralBundle run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport cyclonedxBom centralBundle
# Runs tools/generate-release-notes.sh for the current ref name and stores its
# standard output as build/generated/release-notes/CHANGELOG.md.
- name: Generate release changelog
shell: bash
run: |
set -euo pipefail
chmod +x ./tools/generate-release-notes.sh
mkdir -p build/generated/release-notes
./tools/generate-release-notes.sh "${GITHUB_REF_NAME}" > build/generated/release-notes/CHANGELOG.md
# Packages the distribution ZIP in a separate Gradle invocation placed after
# the changelog step, so the generated CHANGELOG.md already exists on disk.
- name: Package release distribution
run: ./gradlew --no-daemon distZip
- name: Publish bundle to Maven Central - name: Publish bundle to Maven Central
shell: bash shell: bash
@@ -188,7 +208,7 @@ jobs:
- name: Publish GitHub release assets - name: Publish GitHub release assets
uses: softprops/action-gh-release@v2 uses: softprops/action-gh-release@v2
with: with:
generate_release_notes: true body_path: build/generated/release-notes/CHANGELOG.md
files: | files: |
build/distributions/*.zip build/distributions/*.zip
build/reports/sbom/radixor-sbom.json build/reports/sbom/radixor-sbom.json

View File

@@ -17,6 +17,7 @@ on:
- 'gradlew' - 'gradlew'
- 'gradlew.bat' - 'gradlew.bat'
- '.github/workflows/pages.yml' - '.github/workflows/pages.yml'
- 'tools/generate-pages-badges.py'
workflow_dispatch: workflow_dispatch:
permissions: permissions:
@@ -83,11 +84,13 @@ jobs:
SITE_DIR=".gh-pages" SITE_DIR=".gh-pages"
RUN_DIR="${SITE_DIR}/builds/${GITHUB_RUN_NUMBER}" RUN_DIR="${SITE_DIR}/builds/${GITHUB_RUN_NUMBER}"
RUN_METRICS_DIR="${RUN_DIR}/metrics"
LATEST_DIR="${SITE_DIR}/builds/latest" LATEST_DIR="${SITE_DIR}/builds/latest"
LATEST_METRICS_DIR="${LATEST_DIR}/metrics"
mkdir -p "${RUN_DIR}" mkdir -p "${RUN_DIR}"
rm -rf "${LATEST_DIR}" rm -rf "${LATEST_DIR}"
mkdir -p "${LATEST_DIR}" mkdir -p "${LATEST_DIR}" "${RUN_METRICS_DIR}" "${LATEST_METRICS_DIR}"
cp -R build/docs/javadoc "${RUN_DIR}/javadoc" cp -R build/docs/javadoc "${RUN_DIR}/javadoc"
cp -R build/docs/javadoc "${LATEST_DIR}/javadoc" cp -R build/docs/javadoc "${LATEST_DIR}/javadoc"
@@ -152,6 +155,26 @@ jobs:
SBOM_XML_LATEST_LINK='<li><a href="./builds/latest/sbom/radixor-sbom.xml">SBOM (XML)</a></li>' SBOM_XML_LATEST_LINK='<li><a href="./builds/latest/sbom/radixor-sbom.xml">SBOM (XML)</a></li>'
fi fi
# Run the badge generator on the JaCoCo XML, PIT XML, and JMH CSV reports,
# passing both the per-run and the "latest" metrics directories.
# NOTE(review): the script is not shown here; it presumably writes
# coverage-badge.json, pitest-badge.json, and jmh-badge.json into both
# directories - confirm against tools/generate-pages-badges.py.
python3 \
./tools/generate-pages-badges.py \
--jacoco-xml build/reports/jacoco/test/jacocoTestReport.xml \
--pit-xml build/reports/pitest/mutations.xml \
--jmh-csv build/reports/jmh/jmh-results.csv \
--run-metrics-dir "${RUN_METRICS_DIR}" \
--latest-metrics-dir "${LATEST_METRICS_DIR}"
#######################################
# Renders one reports-index list item for a badge metadata file.
# A hyperlink is emitted only when the file exists in the per-run metrics
# directory; otherwise a "not available" placeholder is emitted, so the
# published index never links to metadata that was not generated.
# Globals:   RUN_METRICS_DIR (read)
# Arguments: $1 - human-readable label
#            $2 - metadata file name inside the metrics directory
#            $3 - href prefix that the file name is appended to
# Outputs:   the <li> element on stdout (no trailing newline)
#######################################
badge_metadata_item() {
  local label=$1
  local file_name=$2
  local href_prefix=$3

  if [ -f "${RUN_METRICS_DIR}/${file_name}" ]; then
    printf '<li><a href="%s/%s">%s</a></li>' "${href_prefix}" "${file_name}" "${label}"
  else
    printf '<li>%s: not available</li>' "${label}"
  fi
}

# Every badge is checked individually: coverage, mutation, and benchmark
# metadata can each be missing independently of the others.
COVERAGE_BADGE_LINK="$(badge_metadata_item 'Coverage Badge Metadata' coverage-badge.json ./metrics)"
COVERAGE_BADGE_LATEST_LINK="$(badge_metadata_item 'Coverage Badge Metadata' coverage-badge.json ./builds/latest/metrics)"
MUTATION_BADGE_LINK="$(badge_metadata_item 'Mutation Badge Metadata' pitest-badge.json ./metrics)"
MUTATION_BADGE_LATEST_LINK="$(badge_metadata_item 'Mutation Badge Metadata' pitest-badge.json ./builds/latest/metrics)"
JMH_BADGE_LINK="$(badge_metadata_item 'Benchmark Badge Metadata' jmh-badge.json ./metrics)"
JMH_BADGE_LATEST_LINK="$(badge_metadata_item 'Benchmark Badge Metadata' jmh-badge.json ./builds/latest/metrics)"
cat > "${RUN_DIR}/index.html" <<EOF cat > "${RUN_DIR}/index.html" <<EOF
<!doctype html> <!doctype html>
<html lang="en"> <html lang="en">
@@ -178,6 +201,9 @@ jobs:
${DEPENDENCY_CHECK_LINK:-<li>Dependency Vulnerability Report: not available</li>} ${DEPENDENCY_CHECK_LINK:-<li>Dependency Vulnerability Report: not available</li>}
${SBOM_JSON_LINK:-<li>SBOM (JSON): not available</li>} ${SBOM_JSON_LINK:-<li>SBOM (JSON): not available</li>}
${SBOM_XML_LINK:-<li>SBOM (XML): not available</li>} ${SBOM_XML_LINK:-<li>SBOM (XML): not available</li>}
${COVERAGE_BADGE_LINK}
${MUTATION_BADGE_LINK}
${JMH_BADGE_LINK}
<li><a href="./pitest/">Mutation Testing Report</a></li> <li><a href="./pitest/">Mutation Testing Report</a></li>
$( $(
[ "${HAS_JMH}" = "true" ] && { echo "${JMH_TXT_LINK:-<li>Benchmark Results (TXT): not available</li>}"; echo "${JMH_CSV_LINK:-<li>Benchmark Results (CSV): not available</li>}"; } \ [ "${HAS_JMH}" = "true" ] && { echo "${JMH_TXT_LINK:-<li>Benchmark Results (TXT): not available</li>}"; echo "${JMH_CSV_LINK:-<li>Benchmark Results (CSV): not available</li>}"; } \
@@ -227,6 +253,9 @@ jobs:
${DEPENDENCY_CHECK_LATEST_LINK:-<li>Dependency Vulnerability Report: not currently available</li>} ${DEPENDENCY_CHECK_LATEST_LINK:-<li>Dependency Vulnerability Report: not currently available</li>}
${SBOM_JSON_LATEST_LINK:-<li>SBOM (JSON): not available</li>} ${SBOM_JSON_LATEST_LINK:-<li>SBOM (JSON): not available</li>}
${SBOM_XML_LATEST_LINK:-<li>SBOM (XML): not available</li>} ${SBOM_XML_LATEST_LINK:-<li>SBOM (XML): not available</li>}
${COVERAGE_BADGE_LATEST_LINK}
${MUTATION_BADGE_LATEST_LINK}
${JMH_BADGE_LATEST_LINK}
<li><a href="./builds/latest/pitest/">Mutation Testing Report</a></li> <li><a href="./builds/latest/pitest/">Mutation Testing Report</a></li>
$( $(
[ "${HAS_JMH}" = "true" ] && { echo "${JMH_TXT_LATEST_LINK:-<li>Benchmark Results (TXT): not available</li>}"; echo "${JMH_CSV_LATEST_LINK:-<li>Benchmark Results (CSV): not available</li>}"; } \ [ "${HAS_JMH}" = "true" ] && { echo "${JMH_TXT_LATEST_LINK:-<li>Benchmark Results (TXT): not available</li>}"; echo "${JMH_CSV_LATEST_LINK:-<li>Benchmark Results (CSV): not available</li>}"; } \

3
.gitignore vendored
View File

@@ -90,6 +90,9 @@ local.properties
# PMD plugin conf # PMD plugin conf
.pmd .pmd
# jqwik local db
.jqwik-database
##---------------------------------------------------------------------------------------- Gradle ##---------------------------------------------------------------------------------------- Gradle
.gradle .gradle
**/build/ **/build/

View File

@@ -2,6 +2,15 @@
# Radixor # Radixor
[![Quality gates](https://github.com/leogalambos/Radixor/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/leogalambos/Radixor/actions/workflows/build.yml)
[![Coverage](https://img.shields.io/endpoint?url=https://leogalambos.github.io/Radixor/builds/latest/metrics/coverage-badge.json)](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
[![Published reports](https://img.shields.io/badge/reports-GitHub%20Pages-blue)](https://leogalambos.github.io/Radixor/builds/latest/)
[![Mutation score](https://img.shields.io/endpoint?url=https://leogalambos.github.io/Radixor/builds/latest/metrics/pitest-badge.json)](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
[![English benchmark](https://img.shields.io/endpoint?url=https://leogalambos.github.io/Radixor/builds/latest/metrics/jmh-badge.json)](https://leogalambos.github.io/Radixor/builds/latest/jmh/jmh-results.txt)
[![Maven Central](https://img.shields.io/maven-central/v/org.egothor/radixor)](https://central.sonatype.com/artifact/org.egothor/radixor)
[![License](https://img.shields.io/github/license/leogalambos/Radixor)](LICENSE)
[![Java](https://img.shields.io/badge/Java-21%2B-brightgreen)](#)
*Fast algorithmic stemming with compact patch-command tries — measured at about 4× to 6× the throughput of the Snowball Porter stemmer family on the current English benchmark workload.* *Fast algorithmic stemming with compact patch-command tries — measured at about 4× to 6× the throughput of the Snowball Porter stemmer family on the current English benchmark workload.*
**Radixor** is a fast, algorithmic stemming toolkit for Java, built around compact **patch-command tries** in the tradition of the original **Egothor** stemmer. **Radixor** is a fast, algorithmic stemming toolkit for Java, built around compact **patch-command tries** in the tradition of the original **Egothor** stemmer.

View File

@@ -70,6 +70,7 @@ dependencies {
testImplementation libs.mockito.core testImplementation libs.mockito.core
testImplementation libs.mockito.junit.jupiter testImplementation libs.mockito.junit.jupiter
testImplementation libs.jqwik
mockitoAgent(libs.mockito.core) { mockitoAgent(libs.mockito.core) {
transitive = false transitive = false
@@ -187,6 +188,54 @@ pitest {
application { application {
mainClass = 'org.egothor.stemmer.Compile' mainClass = 'org.egothor.stemmer.Compile'
applicationName = 'radixor'
executableDir = 'bin'
}
// Contents of the main application distribution archive.
distributions {
main {
distributionBaseName = 'radixor'
contents {
// README.md and LICENSE are placed at the archive root.
from('README.md') {
into ''
}
from('LICENSE') {
into ''
}
// Only the documentation pages listed below are copied into docs/.
from('docs') {
into 'docs'
include 'quick-start.md'
include 'cli-compilation.md'
include 'dictionary-format.md'
include 'built-in-languages.md'
include 'programmatic-usage.md'
include 'architecture-and-reduction.md'
include 'quality-and-operations.md'
include 'benchmarking.md'
}
// Adds CHANGELOG.md from build/generated/release-notes to the archive root.
// NOTE(review): presumably nothing is added when the file has not been
// generated - confirm against Gradle's handling of a missing source directory.
from(layout.buildDirectory.dir('generated/release-notes')) {
into ''
include 'CHANGELOG.md'
}
}
}
}
// NOTE(review): applicationName is also set to 'radixor' in the application
// block; confirm whether this task-level setting is still required.
tasks.named('startScripts') {
applicationName = 'radixor'
}
// Sets the ZIP base name to 'radixor' and tags it with the 'bin' classifier.
tasks.named('distZip', Zip) {
archiveBaseName = 'radixor'
archiveClassifier = 'bin'
}
// The TAR distribution task is disabled.
tasks.named('distTar') {
enabled = false
}
} }
jmh { jmh {

View File

@@ -7,6 +7,11 @@ com.google.code.gson:gson:2.13.2=pmd
com.google.errorprone:error_prone_annotations:2.41.0=pmd com.google.errorprone:error_prone_annotations:2.41.0=pmd
net.bytebuddy:byte-buddy-agent:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath net.bytebuddy:byte-buddy-agent:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.bytebuddy:byte-buddy:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath net.bytebuddy:byte-buddy:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.jqwik:jqwik-api:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.jqwik:jqwik-engine:1.9.3=jmhRuntimeClasspath,testRuntimeClasspath
net.jqwik:jqwik-time:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.jqwik:jqwik-web:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.jqwik:jqwik:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
net.sf.jopt-simple:jopt-simple:4.9=pitest net.sf.jopt-simple:jopt-simple:4.9=pitest
net.sf.jopt-simple:jopt-simple:5.0.4=jmh,jmhCompileClasspath,jmhRuntimeClasspath net.sf.jopt-simple:jopt-simple:5.0.4=jmh,jmhCompileClasspath,jmhRuntimeClasspath
net.sf.saxon:Saxon-HE:12.9=pmd net.sf.saxon:Saxon-HE:12.9=pmd
@@ -19,7 +24,7 @@ org.apache.commons:commons-lang3:3.18.0=pitest
org.apache.commons:commons-lang3:3.20.0=pmd org.apache.commons:commons-lang3:3.20.0=pmd
org.apache.commons:commons-math3:3.6.1=jmh,jmhCompileClasspath,jmhRuntimeClasspath org.apache.commons:commons-math3:3.6.1=jmh,jmhCompileClasspath,jmhRuntimeClasspath
org.apache.commons:commons-text:1.14.0=pitest org.apache.commons:commons-text:1.14.0=pitest
org.apiguardian:apiguardian-api:1.1.2=testCompileClasspath org.apiguardian:apiguardian-api:1.1.2=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
org.checkerframework:checker-qual:3.52.1=pmd org.checkerframework:checker-qual:3.52.1=pmd
org.jacoco:org.jacoco.agent:0.8.14=jacocoAgent,jacocoAnt org.jacoco:org.jacoco.agent:0.8.14=jacocoAgent,jacocoAnt
org.jacoco:org.jacoco.ant:0.8.14=jacocoAnt org.jacoco:org.jacoco.ant:0.8.14=jacocoAnt

View File

@@ -1,12 +1,14 @@
# #
# After changing dependency versions: # After changing dependency versions:
# #
# unlock temporarily: LockMode.STRICT -> LockMode.LENIENT
#
# refresh verification metadata:
# ./gradlew --write-verification-metadata sha256 test jmh distZip cyclonedxBom
#
# run: # run:
# ./gradlew --write-locks classes testClasses jmh distZip cyclonedxBom # ./gradlew --write-locks classes testClasses jmh distZip cyclonedxBom
# #
# if needed, refresh verification metadata:
# ./gradlew --write-verification-metadata sha256 test jmh distZip cyclonedxBom
#
# (optional - for Eclipse IDE) # (optional - for Eclipse IDE)
# insert trusted-artifacts into gradle/verification-metadata.xml/verification-metadata/configuration: # insert trusted-artifacts into gradle/verification-metadata.xml/verification-metadata/configuration:
# <trusted-artifacts> # <trusted-artifacts>
@@ -21,6 +23,7 @@
[versions] [versions]
junit = "5.14.3" junit = "5.14.3"
mockito = "5.23.0" mockito = "5.23.0"
jqwik = "1.9.3"
[libraries] [libraries]
junit-bom = { module = "org.junit:junit-bom", version.ref = "junit" } junit-bom = { module = "org.junit:junit-bom", version.ref = "junit" }
@@ -29,3 +32,5 @@ junit-platform-launcher = { module = "org.junit.platform:junit-platform-launcher
mockito-core = { module = "org.mockito:mockito-core", version.ref = "mockito" } mockito-core = { module = "org.mockito:mockito-core", version.ref = "mockito" }
mockito-junit-jupiter = { module = "org.mockito:mockito-junit-jupiter", version.ref = "mockito" } mockito-junit-jupiter = { module = "org.mockito:mockito-junit-jupiter", version.ref = "mockito" }
jqwik = { module = "net.jqwik:jqwik", version.ref = "jqwik" }

View File

@@ -131,7 +131,15 @@ tasks.register('centralBundle', Zip) {
dependsOn(tasks.named('createCentralChecksums')) dependsOn(tasks.named('createCentralChecksums'))
from(centralStagingRepositoryDirectory) from(centralStagingRepositoryDirectory) {
exclude '**/maven-metadata*.xml'
exclude '**/maven-metadata*.xml.md5'
exclude '**/maven-metadata*.xml.sha1'
exclude '**/maven-metadata*.xml.asc'
exclude '**/maven-metadata*.xml.asc.md5'
exclude '**/maven-metadata*.xml.asc.sha1'
}
destinationDirectory = centralBundleDirectory destinationDirectory = centralBundleDirectory
archiveFileName = "radixor-${project.version}-central-bundle.zip" archiveFileName = "radixor-${project.version}-central-bundle.zip"
} }

View File

@@ -568,6 +568,46 @@
<sha256 value="1af699f8d9ddab67f9a0d202fbd7915eb0362a5a6dfd5ffc54cafa3465c9cb0a" origin="Generated by Gradle"/> <sha256 value="1af699f8d9ddab67f9a0d202fbd7915eb0362a5a6dfd5ffc54cafa3465c9cb0a" origin="Generated by Gradle"/>
</artifact> </artifact>
</component> </component>
<component group="net.jqwik" name="jqwik" version="1.9.3">
<artifact name="jqwik-1.9.3.jar">
<sha256 value="562931e1667308180056a8ce85791f71ab8c37ca8efc2006a163ba5d650e5f73" origin="Generated by Gradle"/>
</artifact>
<artifact name="jqwik-1.9.3.module">
<sha256 value="681316f856db4ea3cac8fcced811127fc1d7016875e5b50aa4a55024513a93d7" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="net.jqwik" name="jqwik-api" version="1.9.3">
<artifact name="jqwik-api-1.9.3.jar">
<sha256 value="4bce7e80beb6d9d7092a799fa8a509d76cc31dbb20c938a9952965c15d1dd9b2" origin="Generated by Gradle"/>
</artifact>
<artifact name="jqwik-api-1.9.3.module">
<sha256 value="69984416ea2e9f7fde40cfac983d2f540d3a37e9766fd3b0a06fada8f9b4cff2" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="net.jqwik" name="jqwik-engine" version="1.9.3">
<artifact name="jqwik-engine-1.9.3.jar">
<sha256 value="b85592ee78e30239ccfdca7a134f918ee94ebec51ad29a313fc9a676d97b3ede" origin="Generated by Gradle"/>
</artifact>
<artifact name="jqwik-engine-1.9.3.module">
<sha256 value="2c68479ebda9e334bc9033abd2ef227353808f20114f197947b5c7b9646ab8e5" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="net.jqwik" name="jqwik-time" version="1.9.3">
<artifact name="jqwik-time-1.9.3.jar">
<sha256 value="9fd09021d8f03d44990457bf3095cf0aaf34d2785d1108ff22590286c233b3e5" origin="Generated by Gradle"/>
</artifact>
<artifact name="jqwik-time-1.9.3.module">
<sha256 value="c2b056576c8767bfcd7efdd982890fbc71e608fb5c9c80fc145cfee6adeeaa24" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="net.jqwik" name="jqwik-web" version="1.9.3">
<artifact name="jqwik-web-1.9.3.jar">
<sha256 value="6aee9d583c1ff9efe319b2fa0bc9d75fc616de6d1f240ddbd2af9eabda483dbe" origin="Generated by Gradle"/>
</artifact>
<artifact name="jqwik-web-1.9.3.module">
<sha256 value="38c86130c8b86c1657b4f8256e065ee08551f7c5ce728d1a5be8f63133b14554" origin="Generated by Gradle"/>
</artifact>
</component>
<component group="net.sf.jopt-simple" name="jopt-simple" version="4.9"> <component group="net.sf.jopt-simple" name="jopt-simple" version="4.9">
<artifact name="jopt-simple-4.9.jar"> <artifact name="jopt-simple-4.9.jar">
<sha256 value="26c5856e954b5f864db76f13b86919b59c6eecf9fd930b96baa8884626baf2f5" origin="Generated by Gradle"/> <sha256 value="26c5856e954b5f864db76f13b86919b59c6eecf9fd930b96baa8884626baf2f5" origin="Generated by Gradle"/>

View File

@@ -426,6 +426,8 @@ public final class FrequencyTrie<V> {
childNodeIds[edgeIndex] = dataInput.readInt(); childNodeIds[edgeIndex] = dataInput.readInt();
} }
validateSerializedEdges(nodeIndex, edgeLabels);
final int valueCount = dataInput.readInt(); final int valueCount = dataInput.readInt();
if (valueCount < 0) { if (valueCount < 0) {
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount); throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
@@ -474,6 +476,28 @@ public final class FrequencyTrie<V> {
return nodes; return nodes;
} }
/**
 * Checks that the edge labels read for one serialized node are usable for
 * child lookup.
 *
 * <p>
 * Child lookup in a compiled node is a binary search, which is only well
 * defined when the labels are stored in strictly ascending order and contain
 * no duplicates. A stream that violates this is rejected up front instead of
 * producing a trie whose lookups would behave unpredictably.
 * </p>
 *
 * @param nodeIndex  serialized node identifier
 * @param edgeLabels serialized edge labels
 * @throws IOException if the edge labels are not strictly ascending
 */
private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException {
    final int labelCount = edgeLabels.length;
    if (labelCount < 2) {
        // Zero or one label is trivially ordered.
        return;
    }

    char previousLabel = edgeLabels[0];
    for (int position = 1; position < labelCount; position++) {
        final char currentLabel = edgeLabels[position];
        if (currentLabel <= previousLabel) {
            throw new IOException("Edge labels must be strictly ascending at node " + nodeIndex + ", edge index "
                    + position + ": '" + previousLabel + "' then '" + currentLabel + "'.");
        }
        previousLabel = currentLabel;
    }
}
/** /**
* Locates the compiled node for the supplied key. * Locates the compiled node for the supplied key.
* *

View File

@@ -117,7 +117,14 @@ public final class PatchCommandEncoder {
private static final int MISMATCH_PENALTY = 100; private static final int MISMATCH_PENALTY = 100;
/** /**
* Extra headroom added when internal matrices need to grow. * Extra matrix headroom reserved beyond the immediately required dimensions.
*
* <p>
* A small fixed margin reduces repeated reallocation when a caller encodes many
* similarly sized terms in sequence. The value is intentionally modest: large
* enough to absorb minor size fluctuations, yet small enough to avoid
* materially over-allocating the reused dynamic-programming matrices.
* </p>
*/ */
private static final int CAPACITY_MARGIN = 8; private static final int CAPACITY_MARGIN = 8;
@@ -288,6 +295,7 @@ public final class PatchCommandEncoder {
* @param patchCommand compact patch command * @param patchCommand compact patch command
* @return transformed word, or {@code null} when {@code source} is {@code null} * @return transformed word, or {@code null} when {@code source} is {@code null}
*/ */
@SuppressWarnings({ "PMD.CyclomaticComplexity", "PMD.AvoidLiteralsInIfCondition" })
public static String apply(String source, String patchCommand) { public static String apply(String source, String patchCommand) {
if (source == null) { if (source == null) {
return null; return null;
@@ -299,6 +307,10 @@ public final class PatchCommandEncoder {
return source; return source;
} }
if ((patchCommand.length() & 1) != 0) {
return source;
}
StringBuilder result = new StringBuilder(source); StringBuilder result = new StringBuilder(source);
if (result.isEmpty()) { if (result.isEmpty()) {
@@ -312,11 +324,14 @@ public final class PatchCommandEncoder {
char opcode = patchCommand.charAt(patchIndex); char opcode = patchCommand.charAt(patchIndex);
char argument = patchCommand.charAt(patchIndex + 1); char argument = patchCommand.charAt(patchIndex + 1);
int encodedCount = argument - 'a' + 1;
switch (opcode) { switch (opcode) {
case SKIP_OPCODE: case SKIP_OPCODE:
position = position - encodedCount + 1; final int skipCount = decodeEncodedCount(argument);
if (skipCount < 1) {
return source;
}
position = position - skipCount + 1;
break; break;
case REPLACE_OPCODE: case REPLACE_OPCODE:
@@ -324,8 +339,12 @@ public final class PatchCommandEncoder {
break; break;
case DELETE_OPCODE: case DELETE_OPCODE:
final int deleteCount = decodeEncodedCount(argument);
if (deleteCount < 1) {
return source;
}
int deleteEndExclusive = position + 1; int deleteEndExclusive = position + 1;
position -= encodedCount - 1; position -= deleteCount - 1;
result.delete(position, deleteEndExclusive); result.delete(position, deleteEndExclusive);
break; break;
@@ -353,6 +372,26 @@ public final class PatchCommandEncoder {
return result.toString(); return result.toString();
} }
/**
 * Translates the count argument of a skip or delete instruction into the
 * number of affected characters.
 *
 * <p>
 * The encoding maps {@code 'a'} to one character, {@code 'b'} to two, and so
 * on. An argument ordered before {@code 'a'} is malformed; it is signalled with
 * a sentinel value rather than an exception so that callers can take their
 * compatibility fallback path.
 * </p>
 *
 * @param argument serialized count argument
 * @return decoded positive count, or {@code -1} when the argument is malformed
 */
@SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
private static int decodeEncodedCount(final char argument) {
    final int offsetFromBase = argument - 'a';
    return offsetFromBase < 0 ? -1 : offsetFromBase + 1;
}
/** /**
* Applies a patch command to an empty source word. * Applies a patch command to an empty source word.
* *

View File

@@ -31,6 +31,7 @@
package org.egothor.stemmer.trie; package org.egothor.stemmer.trie;
import java.util.Arrays; import java.util.Arrays;
import java.util.Objects;
/** /**
* Immutable compiled trie node optimized for read access. * Immutable compiled trie node optimized for read access.
@@ -38,7 +39,9 @@ import java.util.Arrays;
* <p> * <p>
* The returned arrays are the internal backing storage of the compiled node. * The returned arrays are the internal backing storage of the compiled node.
* They are exposed for efficient access by closely related trie infrastructure * They are exposed for efficient access by closely related trie infrastructure
* and therefore must never be modified by callers. * and therefore must never be modified by callers. The node itself is still
* immutable from the public API perspective because construction wires these
* arrays once and all lookup operations thereafter treat them as read-only.
* *
* @param <V> value type * @param <V> value type
* @param edgeLabels internal edge label array * @param edgeLabels internal edge label array
@@ -46,8 +49,90 @@ import java.util.Arrays;
* @param orderedValues internal ordered values array * @param orderedValues internal ordered values array
* @param orderedCounts internal ordered counts array * @param orderedCounts internal ordered counts array
*/ */
@SuppressWarnings("PMD.DataClass")
public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues, int... orderedCounts) { public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues, int... orderedCounts) {
/**
* Creates one validated compiled node.
*
* @throws NullPointerException if any array argument is {@code null}
* @throws IllegalArgumentException if the edge-related arrays or value-related
* arrays do not have matching lengths
*/
public CompiledNode {
Objects.requireNonNull(edgeLabels, "edgeLabels");
Objects.requireNonNull(children, "children");
Objects.requireNonNull(orderedValues, "orderedValues");
Objects.requireNonNull(orderedCounts, "orderedCounts");
if (edgeLabels.length != children.length) {
throw new IllegalArgumentException("edgeLabels and children must have the same length.");
}
if (orderedValues.length != orderedCounts.length) {
throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
}
}
/**
 * Exposes the edge labels of this node.
 *
 * <p>
 * For performance reasons the backing array itself is handed out, not a copy.
 * Callers must not write to it.
 *
 * @return internal edge-label array
 */
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public char[] edgeLabels() {
    return edgeLabels;
}
/**
 * Exposes the child nodes of this node.
 *
 * <p>
 * For performance reasons the backing array itself is handed out, not a copy.
 * External callers must not write to it.
 *
 * @return internal child-node array
 */
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public CompiledNode<V>[] children() {
    return children;
}
/**
 * Exposes the ordered values stored at this node.
 *
 * <p>
 * For performance reasons the backing array itself is handed out, not a copy.
 * Callers must not write to it.
 *
 * @return internal ordered-values array
 */
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public V[] orderedValues() {
    return orderedValues;
}
/**
 * Exposes the ordered counts stored at this node.
 *
 * <p>
 * For performance reasons the backing array itself is handed out, not a copy.
 * Callers must not write to it.
 *
 * @return internal ordered-counts array
 */
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public int[] orderedCounts() {
    return orderedCounts;
}
/** /**
* Finds a child for the supplied edge character. * Finds a child for the supplied edge character.
* *

View File

@@ -30,14 +30,18 @@
******************************************************************************/ ******************************************************************************/
package org.egothor.stemmer.trie; package org.egothor.stemmer.trie;
import java.util.Objects;
/** /**
* Intermediate node data used during deserialization before child references * Intermediate node data used during deserialization before child references
* are resolved. * are resolved.
* *
* <p> * <p>
* The arrays exposed by the accessors are the internal backing storage of this * The arrays exposed by the accessors are the internal backing storage of this
* holder. They are returned directly for efficiency and therefore must be * holder. They are returned directly for efficiency because the deserialization
* treated as read-only by callers. * pipeline copies references into immutable compiled nodes immediately after
* the record is created. Callers must therefore treat every returned array as
* read-only.
* *
* @param <V> value type * @param <V> value type
* @param edgeLabels edge labels * @param edgeLabels edge labels
@@ -45,6 +49,87 @@ package org.egothor.stemmer.trie;
* @param orderedValues ordered values * @param orderedValues ordered values
* @param orderedCounts ordered counts * @param orderedCounts ordered counts
*/ */
@SuppressWarnings("PMD.DataClass")
public record NodeData<V>(char[] edgeLabels, int[] childNodeIds, V[] orderedValues, int... orderedCounts) { public record NodeData<V>(char[] edgeLabels, int[] childNodeIds, V[] orderedValues, int... orderedCounts) {
/**
* Creates one validated node-data holder.
*
* @throws NullPointerException if any array argument is {@code null}
* @throws IllegalArgumentException if the edge-related arrays or value-related
* arrays do not have matching lengths
*/
public NodeData {
Objects.requireNonNull(edgeLabels, "edgeLabels");
Objects.requireNonNull(childNodeIds, "childNodeIds");
Objects.requireNonNull(orderedValues, "orderedValues");
Objects.requireNonNull(orderedCounts, "orderedCounts");
if (edgeLabels.length != childNodeIds.length) {
throw new IllegalArgumentException("edgeLabels and childNodeIds must have the same length.");
}
if (orderedValues.length != orderedCounts.length) {
throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
}
}
/**
 * Exposes the edge-label array held by this holder.
 *
 * <p>
 * No defensive copy is made, for performance reasons. The caller receives the
 * stored array itself and must not modify it.
 *
 * @return the stored edge-label array (not a copy)
 */
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public char[] edgeLabels() {
    return edgeLabels;
}
/**
 * Exposes the child-node identifier array held by this holder.
 *
 * <p>
 * No defensive copy is made, for performance reasons. The caller receives the
 * stored array itself and must not modify it.
 *
 * @return the stored child-node identifier array (not a copy)
 */
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public int[] childNodeIds() {
    return childNodeIds;
}
/**
 * Exposes the ordered-values array held by this holder.
 *
 * <p>
 * No defensive copy is made, for performance reasons. The caller receives the
 * stored array itself and must not modify it.
 *
 * @return the stored ordered-values array (not a copy)
 */
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public V[] orderedValues() {
    return orderedValues;
}
/**
 * Exposes the ordered-counts array held by this holder.
 *
 * <p>
 * No defensive copy is made, for performance reasons. The caller receives the
 * stored array itself and must not modify it.
 *
 * @return the stored ordered-counts array (not a copy)
 */
@Override
@SuppressWarnings("PMD.MethodReturnsInternalArray")
public int[] orderedCounts() {
    return orderedCounts;
}
} }

View File

@@ -0,0 +1,218 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.List;
import net.jqwik.api.ForAll;
import net.jqwik.api.Label;
import net.jqwik.api.Property;
import net.jqwik.api.Tag;
/**
* Property-based tests for the compiled trie abstraction.
*
* <p>
* These properties focus on deterministic compilation, observable lookup
* alignment, binary persistence stability, and safe reconstruction back into a
* writable builder. Together they guard the most valuable invariants of the
* core algorithm without overfitting to particular fixture data.
*/
@Label("FrequencyTrie properties")
@Tag("unit")
@Tag("property")
@Tag("trie")
class FrequencyTrieProperties extends PropertyBasedTestSupport {
/**
* Binary codec used by generic trie round-trip assertions.
*/
private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new FrequencyTrie.ValueStreamCodec<>() {
@Override
public void write(final DataOutputStream dataOutput, final String value) throws IOException {
dataOutput.writeUTF(value);
}
@Override
public String read(final DataInputStream dataInput) throws IOException {
return dataInput.readUTF();
}
};
/**
* Verifies that compiling the same insertion scenario repeatedly yields the
* same observable lookups.
*
* @param scenario generated trie scenario
* @param reductionMode reduction mode
*/
@Property(tries = 80)
@Label("compilation should be deterministic for the same insertion scenario")
void compilationShouldBeDeterministicForTheSameInsertionScenario(
@ForAll("trieScenarios") final TrieScenario scenario, @ForAll final ReductionMode reductionMode) {
final FrequencyTrie<String> first = buildTrie(scenario, reductionMode);
final FrequencyTrie<String> second = buildTrie(scenario, reductionMode);
for (String key : scenario.observedKeys()) {
assertTrieStateEquals(first, second, key);
}
}
/**
* Verifies that {@link FrequencyTrie#get(String)},
* {@link FrequencyTrie#getAll(String)}, and
* {@link FrequencyTrie#getEntries(String)} remain aligned for every probed key.
*
* @param scenario generated trie scenario
* @param reductionMode reduction mode
*/
@Property(tries = 80)
@Label("get, getAll, and getEntries should stay semantically aligned")
void getGetAllAndGetEntriesShouldStaySemanticallyAligned(@ForAll("trieScenarios") final TrieScenario scenario,
@ForAll final ReductionMode reductionMode) {
final FrequencyTrie<String> trie = buildTrie(scenario, reductionMode);
for (String key : scenario.observedKeys()) {
final String preferred = trie.get(key);
final String[] allValues = trie.getAll(key);
final List<ValueCount<String>> entries = trie.getEntries(key);
assertEquals(allValues.length, entries.size(), "getAll() and getEntries() must have equal cardinality.");
if (allValues.length == 0) {
assertNull(preferred, "get() must return null when no terminal value exists.");
assertTrue(entries.isEmpty(), "getEntries() must be empty when getAll() is empty.");
continue;
}
assertEquals(allValues[0], preferred, "get() must expose the preferred first getAll() value.");
int previousCount = Integer.MAX_VALUE;
for (int index = 0; index < entries.size(); index++) {
final ValueCount<String> entry = entries.get(index);
assertEquals(allValues[index], entry.value(), "entry ordering must match getAll() ordering.");
assertTrue(entry.count() >= 1, "stored frequencies must remain positive.");
assertTrue(entry.count() <= previousCount, "entry counts must be ordered descending.");
previousCount = entry.count();
}
}
}
/**
* Verifies that binary serialization and deserialization preserve all
* observable lookup semantics for generated scenarios.
*
* @param scenario generated trie scenario
* @param reductionMode reduction mode
*/
@Property(tries = 40)
@Label("binary round-trip should preserve observable trie semantics")
void binaryRoundTripShouldPreserveObservableTrieSemantics(@ForAll("trieScenarios") final TrieScenario scenario,
@ForAll final ReductionMode reductionMode) {
final FrequencyTrie<String> original = buildTrie(scenario, reductionMode);
final FrequencyTrie<String> roundTripped = roundTrip(original);
for (String key : scenario.observedKeys()) {
assertTrieStateEquals(original, roundTripped, key);
}
}
/**
* Verifies that reconstructing a writable builder from a compiled trie and
* recompiling it preserves observable lookup semantics.
*
* @param scenario generated trie scenario
* @param reductionMode reduction mode
*/
@Property(tries = 60)
@Label("builder reconstruction should preserve observable trie semantics")
void builderReconstructionShouldPreserveObservableTrieSemantics(
@ForAll("trieScenarios") final TrieScenario scenario, @ForAll final ReductionMode reductionMode) {
final FrequencyTrie<String> original = buildTrie(scenario, reductionMode);
final FrequencyTrie<String> rebuilt = FrequencyTrieBuilders
.copyOf(original, STRING_ARRAY_FACTORY, reductionMode).build();
for (String key : scenario.observedKeys()) {
assertEquals(original.get(key), rebuilt.get(key), "preferred lookup must survive reconstruction.");
assertArrayEquals(original.getAll(key), rebuilt.getAll(key),
"complete ordered result set must survive reconstruction.");
}
}
/**
* Asserts full observable trie equality for one key.
*
* @param expected expected trie
* @param actual actual trie
* @param key key to probe
*/
private static void assertTrieStateEquals(final FrequencyTrie<String> expected, final FrequencyTrie<String> actual,
final String key) {
assertEquals(expected.get(key), actual.get(key), "preferred lookup drifted.");
assertArrayEquals(expected.getAll(key), actual.getAll(key), "ordered result set drifted.");
assertIterableEquals(expected.getEntries(key), actual.getEntries(key), "entry list drifted.");
}
/**
* Round-trips one trie through its binary representation.
*
* @param trie trie to persist and reload
* @return reloaded trie
*/
private static FrequencyTrie<String> roundTrip(final FrequencyTrie<String> trie) {
try {
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try (DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream)) {
trie.writeTo(dataOutputStream, STRING_CODEC);
}
try (DataInputStream dataInputStream = new DataInputStream(
new ByteArrayInputStream(byteArrayOutputStream.toByteArray()))) {
return FrequencyTrie.readFrom(dataInputStream, STRING_ARRAY_FACTORY, STRING_CODEC);
}
} catch (IOException exception) {
throw new UncheckedIOException("Unexpected binary round-trip failure.", exception);
}
}
}

View File

@@ -733,6 +733,30 @@ class FrequencyTrieTest {
assertTrue(exception.getMessage().contains("Invalid root node id")); assertTrue(exception.getMessage().contains("Invalid root node id"));
} }
/**
 * Ensures deserialization refuses a serialized node whose edge labels are
 * unsorted or duplicated, because compiled lookup relies on binary search
 * over a strictly ascending edge array. The handcrafted node below writes
 * {@code 'b'} ahead of {@code 'a'}.
 */
@Test
@Tag("persistence")
@DisplayName("readFrom rejects non-ascending serialized edge labels")
void readFromRejectsNonAscendingSerializedEdgeLabels() {
    // Node payload with its two edge labels in descending order.
    final NodeWriter descendingEdges = dataOutput -> {
        dataOutput.writeInt(2);
        dataOutput.writeChar('b');
        dataOutput.writeInt(0);
        dataOutput.writeChar('a');
        dataOutput.writeInt(0);
        dataOutput.writeInt(0);
    };
    final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { descendingEdges });

    final IOException failure = assertThrows(IOException.class,
            () -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));

    final String message = failure.getMessage();
    assertTrue(message.contains("Edge labels must be strictly ascending"));
}
/** /**
* Verifies that deserialization rejects non-positive stored counts. * Verifies that deserialization rejects non-positive stored counts.
*/ */

View File

@@ -0,0 +1,308 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Set;
import java.util.function.IntFunction;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
/**
 * Fixed-seed, fuzz-style checks for trie compilation and for dictionaries
 * loaded into stemmer tries.
 *
 * <p>
 * Inputs are bounded and come from {@link FuzzTestSupport}. The suite asserts
 * that repeated compilation and binary round-trips leave lookups unchanged,
 * that builder reconstruction keeps {@code get()} and {@code getAll()}
 * results, and that every patch returned for a generated word reconstructs a
 * stem declared for that word by the source dictionary.
 */
@DisplayName("Deterministic fuzz-style trie and stemmer compilation")
@Tag("unit")
@Tag("fuzz")
@Tag("trie")
@Tag("stemming")
class FuzzStemmerAndTrieCompilationTest {

    /**
     * Array factory handed to every trie built or read by this suite.
     */
    private static final IntFunction<String[]> ARRAY_FACTORY = String[]::new;

    /**
     * Codec that persists trie values as modified-UTF strings; used by the
     * generic round-trip helper.
     */
    private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new FrequencyTrie.ValueStreamCodec<>() {

        @Override
        public void write(final DataOutputStream dataOutput, final String value) throws IOException {
            dataOutput.writeUTF(value);
        }

        @Override
        public String read(final DataInputStream dataInput) throws IOException {
            return dataInput.readUTF();
        }
    };

    /**
     * Scratch directory that receives generated dictionaries and binary files.
     */
    @TempDir
    Path temporaryDirectory;

    /**
     * For every reduction mode and every generated insertion scenario, checks
     * a second compilation, a binary round-trip, and a builder reconstruction
     * against the first compiled trie.
     *
     * @throws IOException if an unexpected binary I/O failure occurs
     */
    @Test
    @DisplayName("generated trie insertions should preserve semantics across compilation forms")
    void generatedTrieInsertionsShouldPreserveSemanticsAcrossCompilationForms() throws IOException {
        for (ReductionMode reductionMode : ReductionMode.values()) {
            final ReductionSettings settings = ReductionSettings.withDefaults(reductionMode);
            for (FuzzTestSupport.TrieCompilationScenario scenario : FuzzTestSupport.trieCompilationScenarios()
                    .toList()) {
                verifyCompilationForms(reductionMode, settings, scenario);
            }
        }
    }

    /**
     * For every reduction mode and every generated dictionary, checks that the
     * dictionary loads without throwing and that the patches stored for each
     * generated word reconstruct only acceptable stems.
     *
     * @throws IOException if the generated dictionary cannot be written or read
     */
    @Test
    @DisplayName("generated dictionaries should compile and stem consistently")
    void generatedDictionariesShouldCompileAndStemConsistently() throws IOException {
        for (ReductionMode reductionMode : ReductionMode.values()) {
            for (FuzzTestSupport.StemmerDictionaryScenario scenario : FuzzTestSupport.stemmerDictionaryScenarios()
                    .toList()) {
                final String fileName = "fuzz-dictionary-" + reductionMode.name() + "-" + scenario.seed() + ".txt";
                final Path dictionaryFile = this.temporaryDirectory.resolve(fileName);
                Files.writeString(dictionaryFile, scenario.dictionaryContent(), StandardCharsets.UTF_8);

                final FrequencyTrie<String> trie = assertDoesNotThrow(
                        () -> StemmerPatchTrieLoader.load(dictionaryFile, true, reductionMode),
                        describeScenario("generated dictionary must compile", reductionMode, scenario, null));

                for (String word : scenario.expectedStemsByWord().keySet()) {
                    verifyStemming(trie, reductionMode, scenario, word);
                }
            }
        }
    }

    /**
     * Saves each generated stemmer trie to a binary file, loads it back, and
     * compares all lookup views for every generated word.
     *
     * @throws IOException if persistence unexpectedly fails
     */
    @Test
    @DisplayName("generated stemmer tries should survive binary persistence")
    void generatedStemmerTriesShouldSurviveBinaryPersistence() throws IOException {
        for (FuzzTestSupport.StemmerDictionaryScenario scenario : FuzzTestSupport.stemmerDictionaryScenarios()
                .toList()) {
            final String baseName = "binary-fuzz-" + scenario.seed();
            final Path dictionaryFile = this.temporaryDirectory.resolve(baseName + ".txt");
            final Path binaryFile = this.temporaryDirectory.resolve(baseName + ".dat.gz");
            Files.writeString(dictionaryFile, scenario.dictionaryContent(), StandardCharsets.UTF_8);

            final FrequencyTrie<String> source = StemmerPatchTrieLoader.load(dictionaryFile, true,
                    ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
            StemmerPatchTrieLoader.saveBinary(source, binaryFile);
            final FrequencyTrie<String> reloaded = StemmerPatchTrieLoader.loadBinary(binaryFile);

            for (String word : scenario.expectedStemsByWord().keySet()) {
                final String message = "Binary stemmer round-trip drifted for seed=" + scenario.seed() + ", word='"
                        + word + "'.";
                assertTrieStateEquals(source, reloaded, word, message);
            }
        }
    }

    /**
     * Compiles one scenario twice, round-trips it, reconstructs it through a
     * builder, and compares each derived trie with the first compilation for
     * every observed key.
     *
     * @param reductionMode reduction mode under test, used in failure messages
     * @param settings      reduction settings derived from that mode
     * @param scenario      generated scenario
     * @throws IOException if the binary round-trip fails
     */
    private static void verifyCompilationForms(final ReductionMode reductionMode, final ReductionSettings settings,
            final FuzzTestSupport.TrieCompilationScenario scenario) throws IOException {
        final FrequencyTrie<String> reference = buildTrie(scenario, settings);
        final FrequencyTrie<String> recompiled = buildTrie(scenario, settings);
        final FrequencyTrie<String> reloaded = roundTrip(reference);
        final FrequencyTrie<String> reconstructed = FrequencyTrieBuilders
                .copyOf(reference, ARRAY_FACTORY, settings)
                .build();

        for (String key : scenario.observedKeys()) {
            assertTrieStateEquals(reference, recompiled, key,
                    describeScenario("repeated compilation drifted", reductionMode, scenario, key));
            assertTrieStateEquals(reference, reloaded, key,
                    describeScenario("binary round-trip drifted", reductionMode, scenario, key));
            assertTrieLookupSemanticsEqual(reference, reconstructed, key,
                    describeScenario("builder reconstruction drifted", reductionMode, scenario, key));
        }
    }

    /**
     * Checks the patches stored for one generated word: a non-empty preferred
     * patch exists, at least one patch is listed, and applying any of them to
     * the word yields a stem from the scenario's accepted set.
     *
     * @param trie          loaded stemmer trie
     * @param reductionMode reduction mode under test, used in failure messages
     * @param scenario      dictionary scenario the trie was loaded from
     * @param word          generated word to inspect
     */
    private static void verifyStemming(final FrequencyTrie<String> trie, final ReductionMode reductionMode,
            final FuzzTestSupport.StemmerDictionaryScenario scenario, final String word) {
        final Set<String> acceptableStems = scenario.expectedStemsByWord().get(word);
        final String preferredPatch = trie.get(word);
        final String[] allPatches = trie.getAll(word);

        assertAll(
                () -> assertTrue(!(preferredPatch == null || preferredPatch.isEmpty()),
                        describeScenario("preferred patch must exist", reductionMode, scenario, word)),
                () -> assertTrue(allPatches.length > 0,
                        describeScenario("at least one patch must exist", reductionMode, scenario, word)),
                () -> assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(word, preferredPatch)),
                        describeScenario("preferred patch reconstructed an unexpected stem", reductionMode,
                                scenario, word)),
                () -> assertTrue(allPatchesProduceOnlyAcceptableStems(word, allPatches, acceptableStems),
                        describeScenario("getAll() contained a patch outside the accepted stem set", reductionMode,
                                scenario, word)));
    }

    /**
     * Feeds every insertion of a scenario into a fresh builder and compiles it.
     *
     * @param scenario          generated scenario
     * @param reductionSettings reduction settings
     * @return compiled trie
     */
    private static FrequencyTrie<String> buildTrie(final FuzzTestSupport.TrieCompilationScenario scenario,
            final ReductionSettings reductionSettings) {
        final FrequencyTrie.Builder<String> target = new FrequencyTrie.Builder<>(ARRAY_FACTORY, reductionSettings);
        for (FuzzTestSupport.TrieInsertion step : scenario.insertions()) {
            target.put(step.key(), step.value(), step.count());
        }
        return target.build();
    }

    /**
     * Serializes a compiled trie into memory and reads it back.
     *
     * @param trie source trie
     * @return deserialized trie
     * @throws IOException if persistence fails
     */
    private static FrequencyTrie<String> roundTrip(final FrequencyTrie<String> trie) throws IOException {
        final ByteArrayOutputStream sink = new ByteArrayOutputStream();
        trie.writeTo(sink, STRING_CODEC);
        final byte[] serialized = sink.toByteArray();
        return FrequencyTrie.readFrom(new ByteArrayInputStream(serialized), ARRAY_FACTORY, STRING_CODEC);
    }

    /**
     * Compares {@code get()}, {@code getAll()}, and {@code getEntries()} of two
     * tries for one key, reporting all mismatches together.
     *
     * @param expected       reference trie
     * @param actual         candidate trie
     * @param key            key to inspect
     * @param failureMessage assertion message
     */
    private static void assertTrieStateEquals(final FrequencyTrie<String> expected, final FrequencyTrie<String> actual,
            final String key, final String failureMessage) {
        assertAll(
                () -> assertEquals(expected.get(key), actual.get(key), failureMessage),
                () -> assertArrayEquals(expected.getAll(key), actual.getAll(key), failureMessage),
                () -> assertIterableEquals(expected.getEntries(key), actual.getEntries(key), failureMessage));
    }

    /**
     * Compares only {@code get()} and {@code getAll()} of two tries for one key.
     *
     * <p>
     * This weaker comparison is used after builder reconstruction. Some
     * reduction modes intentionally ignore absolute local frequencies when
     * identifying equivalent subtrees, so a trie rebuilt from the reduced form
     * keeps its lookup results but does not necessarily keep the original
     * local counts reported by {@link FrequencyTrie#getEntries(String)}.
     *
     * @param expected       reference trie
     * @param actual         candidate trie
     * @param key            key to inspect
     * @param failureMessage assertion message
     */
    private static void assertTrieLookupSemanticsEqual(final FrequencyTrie<String> expected,
            final FrequencyTrie<String> actual, final String key, final String failureMessage) {
        assertAll(
                () -> assertEquals(expected.get(key), actual.get(key), failureMessage),
                () -> assertArrayEquals(expected.getAll(key), actual.getAll(key), failureMessage));
    }

    /**
     * Reports whether applying each patch to the word yields an accepted stem.
     *
     * @param word            original surface form
     * @param patches         patch commands
     * @param acceptableStems acceptable stems
     * @return {@code true} when all patches are acceptable
     */
    private static boolean allPatchesProduceOnlyAcceptableStems(final String word, final String[] patches,
            final Set<String> acceptableStems) {
        boolean allAccepted = true;
        for (int position = 0; allAccepted && position < patches.length; position++) {
            final String stem = PatchCommandEncoder.apply(word, patches[position]);
            allAccepted = acceptableStems.contains(stem);
        }
        return allAccepted;
    }

    /**
     * Formats an assertion message that names the reduction mode, the scenario,
     * and optionally the word or key being checked.
     *
     * @param prefix        failure prefix
     * @param reductionMode reduction mode under test
     * @param scenario      source scenario
     * @param word          current word or key, may be {@code null}
     * @return contextual message
     */
    private static String describeScenario(final String prefix, final ReductionMode reductionMode, final Object scenario,
            final String word) {
        final String context = prefix + ". reductionMode=" + reductionMode + ", scenario=" + scenario;
        if (word == null) {
            return context;
        }
        return context + ", token='" + word + '\'';
    }
}

View File

@@ -0,0 +1,339 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.stream.Stream;
/**
* Deterministic support utilities for fuzz-style tests of trie compilation and
* stemming dictionary loading.
*
* <p>
* The generators in this helper intentionally use bounded input sizes and fixed
* seeds so that the resulting tests remain reproducible and suitable for CI.
* The goal is not statistical randomness, but broad structured coverage of
* unusual combinations that are cumbersome to author manually.
*/
final class FuzzTestSupport {
/**
 * Fixed seeds, one per generated scenario. Both scenario factories iterate
 * this array in order, so each produces exactly {@code SEEDS.length}
 * scenarios.
 */
private static final long[] SEEDS = { 7L, 19L, 43L, 71L, 101L, 211L };
/**
 * The 26 lower-case ASCII letters from which generated letters are drawn.
 */
private static final char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz".toCharArray();
/**
 * Not instantiable: this class only hosts static helpers.
 *
 * @throws AssertionError always, so that even an in-class call cannot create
 *                        an instance
 */
private FuzzTestSupport() {
throw new AssertionError("No instances.");
}
/**
 * Produces one trie-compilation scenario per entry of {@code SEEDS}, in seed
 * order.
 *
 * @return stream of bounded deterministic scenarios
 */
static Stream<TrieCompilationScenario> trieCompilationScenarios() {
    final Stream.Builder<TrieCompilationScenario> collected = Stream.builder();
    for (int position = 0; position < SEEDS.length; position++) {
        collected.add(createTrieCompilationScenario(SEEDS[position]));
    }
    return collected.build();
}
/**
 * Produces one stemmer-dictionary scenario per entry of {@code SEEDS}, in seed
 * order.
 *
 * @return stream of bounded deterministic scenarios
 */
static Stream<StemmerDictionaryScenario> stemmerDictionaryScenarios() {
    final Stream.Builder<StemmerDictionaryScenario> collected = Stream.builder();
    for (int position = 0; position < SEEDS.length; position++) {
        collected.add(createStemmerDictionaryScenario(SEEDS[position]));
    }
    return collected.build();
}
/**
 * Generates one trie scenario from a seed.
 *
 * <p>
 * Between 50 and 64 insertions are produced. Roughly one key in eight is the
 * empty string; the rest are 1 to 10 letters long. Values are 0 to 8 letters
 * long and counts range from 1 to 4. The observed keys always include the
 * empty string and every inserted key, sometimes the inserted key minus its
 * last character, and one unrelated random word per insertion.
 *
 * @param seed deterministic seed
 * @return generated scenario
 */
private static TrieCompilationScenario createTrieCompilationScenario(final long seed) {
    final Random source = new Random(seed);
    final List<TrieInsertion> recorded = new ArrayList<>();
    final Set<String> probes = new LinkedHashSet<>();
    probes.add("");

    int remaining = 50 + source.nextInt(15);
    while (remaining > 0) {
        remaining--;

        final String key;
        if (source.nextInt(8) == 0) {
            key = "";
        } else {
            key = nextWord(source, 1, 10);
        }
        final String value = nextWord(source, 0, 8);
        final int count = 1 + source.nextInt(4);
        recorded.add(new TrieInsertion(key, value, count));
        probes.add(key);

        // The coin flip is consumed only for non-empty keys.
        if (!key.isEmpty()) {
            if (source.nextBoolean()) {
                probes.add(key.substring(0, key.length() - 1));
            }
        }

        probes.add(nextWord(source, 1, 8));
    }

    return new TrieCompilationScenario(seed, List.copyOf(recorded), List.copyOf(probes));
}
/**
 * Generates one dictionary scenario from a seed.
 *
 * <p>
 * The dictionary text opens with a {@code #} remark, a {@code //} remark, and
 * a blank line. It then holds 18 to 25 lines, each made of a stem, one to
 * four distinct variants, and a trailing {@code # entry N} remark. A blank
 * line follows roughly one line in five. Alongside the text, the method
 * records for every variant the set of stems it was listed under.
 *
 * @param seed deterministic seed
 * @return generated scenario
 */
private static StemmerDictionaryScenario createStemmerDictionaryScenario(final long seed) {
    final Random source = new Random(seed);
    final Map<String, Set<String>> stemsPerWord = new LinkedHashMap<>();
    final StringBuilder text = new StringBuilder(512);
    text.append("# deterministic fuzz dictionary seed ").append(seed).append('\n');
    text.append("// blank and remark handling is part of the exercised input\n\n");

    final int lineCount = 18 + source.nextInt(8);
    for (int line = 0; line < lineCount; line++) {
        final String stem = nextWord(source, 1, 8);
        final LinkedHashSet<String> variants = new LinkedHashSet<>();
        final int wanted = 1 + source.nextInt(4);

        // Keep drawing until enough distinct variants exist; duplicates are absorbed by the set.
        while (variants.size() < wanted) {
            final boolean reuseStem = source.nextInt(6) == 0;
            if (reuseStem) {
                variants.add(stem);
            } else {
                variants.add(createVariant(source, stem));
            }
        }

        text.append(stem);
        for (String variant : variants) {
            text.append(' ').append(variant);
            Set<String> stems = stemsPerWord.get(variant);
            if (stems == null) {
                stems = new LinkedHashSet<>();
                stemsPerWord.put(variant, stems);
            }
            stems.add(stem);
        }
        text.append(" # entry ").append(line).append('\n');

        if (source.nextInt(5) == 0) {
            text.append("\n");
        }
    }

    return new StemmerDictionaryScenario(seed, text.toString(), immutableMapOfSets(stemsPerWord));
}
/**
 * Derives a surface form from a stem using one of six randomly chosen
 * transformations: append a fixed suffix, prepend one letter, replace the last
 * letter (or append one when the stem has a single character), append two
 * letters, drop the last two characters (stems of length two or less are
 * returned unchanged), or reverse the stem and append one letter.
 *
 * @param random source of deterministic pseudo-randomness
 * @param stem   canonical stem
 * @return generated variant
 */
private static String createVariant(final Random random, final String stem) {
    return switch (random.nextInt(6)) {
        case 0 -> stem + suffix(random);
        case 1 -> prefix(random) + stem;
        case 2 -> (stem.length() > 1 ? stem.substring(0, stem.length() - 1) : stem) + nextLetter(random);
        case 3 -> stem + nextLetter(random) + nextLetter(random);
        case 4 -> stem.length() > 2 ? stem.substring(0, stem.length() - 2) : stem;
        default -> new StringBuilder(stem).reverse().append(nextLetter(random)).toString();
    };
}
/**
 * Generates a word whose length is drawn uniformly from the inclusive range
 * {@code [minLength, maxLength]} and whose letters come from
 * {@link #nextLetter(Random)}; the result is lower-cased with
 * {@link Locale#ROOT}.
 *
 * @param random    source of deterministic pseudo-randomness
 * @param minLength minimum inclusive length
 * @param maxLength maximum inclusive length
 * @return generated word
 */
private static String nextWord(final Random random, final int minLength, final int maxLength) {
    final int span = maxLength - minLength + 1;
    final int targetLength = minLength + random.nextInt(span);
    final char[] letters = new char[targetLength];
    for (int position = 0; position < letters.length; position++) {
        letters[position] = nextLetter(random);
    }
    return new String(letters).toLowerCase(Locale.ROOT);
}
/**
 * Produces a prefix fragment consisting of exactly one generated letter.
 *
 * @param random source of deterministic pseudo-randomness
 * @return prefix fragment
 */
private static String prefix(final Random random) {
    final char letter = nextLetter(random);
    return Character.toString(letter);
}
/**
 * Picks one suffix fragment from a fixed list of seven candidates using a
 * single bounded draw from the supplied generator.
 *
 * @param random source of deterministic pseudo-randomness
 * @return suffix fragment
 */
private static String suffix(final Random random) {
    final List<String> candidates = List.of("s", "ed", "ing", "er", "ly", "ness", "ment");
    final int pick = random.nextInt(candidates.size());
    return candidates.get(pick);
}
/**
 * Draws one character from {@code ALPHABET} at a uniformly selected index.
 *
 * @param random source of deterministic pseudo-randomness
 * @return generated character
 */
private static char nextLetter(final Random random) {
    final int position = random.nextInt(ALPHABET.length);
    return ALPHABET[position];
}
/**
 * Creates an unmodifiable deep copy of a map of sets that keeps the
 * iteration order of the source.
 *
 * <p>
 * {@code Map.copyOf} and {@code Set.copyOf} leave iteration order
 * unspecified, which would make traversal of a generated scenario differ
 * between runs even for one fixed seed. Wrapping insertion-ordered copies
 * instead keeps the scenario fully reproducible. The result is detached from
 * {@code source}: later changes to the source map or its sets are not
 * visible through the returned view.
 *
 * @param source mutable source map
 * @return unmodifiable, insertion-ordered copy
 * @throws NullPointerException if {@code source}, a key, a value set, or an
 *                              element of a value set is {@code null}
 */
private static Map<String, Set<String>> immutableMapOfSets(final Map<String, Set<String>> source) {
    final Map<String, Set<String>> copy = new LinkedHashMap<>(source.size());
    for (Map.Entry<String, Set<String>> entry : source.entrySet()) {
        final String key = Objects.requireNonNull(entry.getKey(), "key");
        final Set<String> values = new LinkedHashSet<>(entry.getValue());
        // Keep the null-rejecting contract the previous Set.copyOf-based copy had.
        if (values.contains(null)) {
            throw new NullPointerException("null element in value set of key " + key);
        }
        copy.put(key, java.util.Collections.unmodifiableSet(values));
    }
    return java.util.Collections.unmodifiableMap(copy);
}
/**
* Generated trie scenario for deterministic fuzz testing.
*
* @param seed deterministic seed
* @param insertions generated insertions to apply to the builder
* @param observedKeys keys that should be checked after compilation
*/
record TrieCompilationScenario(long seed, List<TrieInsertion> insertions, List<String> observedKeys) {
/**
* Creates a validated scenario.
*
* @param seed deterministic seed
* @param insertions generated insertions to apply to the builder
* @param observedKeys keys that should be checked after compilation
*/
TrieCompilationScenario {
Objects.requireNonNull(insertions, "insertions");
Objects.requireNonNull(observedKeys, "observedKeys");
}
@Override
public String toString() {
return "seed=" + this.seed;
}
}
/**
 * A single key/value insertion with its occurrence count, as fed to a trie
 * builder.
 *
 * @param key   target key
 * @param value stored value
 * @param count positive occurrence count
 */
record TrieInsertion(String key, String value, int count) {
    /**
     * Rejects {@code null} components and counts below one.
     *
     * @param key   target key
     * @param value stored value
     * @param count positive occurrence count
     */
    TrieInsertion {
        if (key == null) {
            throw new NullPointerException("key");
        }
        if (value == null) {
            throw new NullPointerException("value");
        }
        if (count <= 0) {
            throw new IllegalArgumentException("count must be positive.");
        }
    }
}
/**
 * One generated stemmer-dictionary scenario, identified in test output by
 * its seed only.
 *
 * @param seed                deterministic seed
 * @param dictionaryContent   generated dictionary content
 * @param expectedStemsByWord acceptable stems for each generated word
 */
record StemmerDictionaryScenario(long seed, String dictionaryContent, Map<String, Set<String>> expectedStemsByWord) {
    /**
     * Rejects {@code null} content and a {@code null} expectation map; both
     * are stored as supplied.
     *
     * @param seed                deterministic seed
     * @param dictionaryContent   generated dictionary content
     * @param expectedStemsByWord acceptable stems for each generated word
     */
    StemmerDictionaryScenario {
        if (dictionaryContent == null) {
            throw new NullPointerException("dictionaryContent");
        }
        if (expectedStemsByWord == null) {
            throw new NullPointerException("expectedStemsByWord");
        }
    }

    /**
     * Returns a compact label of the form {@code seed=<value>}.
     */
    @Override
    public String toString() {
        return "seed=".concat(Long.toString(this.seed));
    }
}
}

View File

@@ -0,0 +1,93 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import net.jqwik.api.ForAll;
import net.jqwik.api.Label;
import net.jqwik.api.Property;
import net.jqwik.api.Tag;
/**
 * jqwik properties for {@link PatchCommandEncoder}.
 *
 * <p>
 * Two contracts are checked on generated word pairs: an encoded patch, when
 * applied to its source, yields exactly the requested target; and encoding
 * the same pair repeatedly yields the same patch text.
 */
@Label("PatchCommandEncoder properties")
@Tag("unit")
@Tag("property")
@Tag("patch")
class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
    /**
     * Encodes a patch from {@code source} to {@code target} and checks that
     * applying it to {@code source} gives back {@code target}.
     *
     * @param source source word
     * @param target target word
     */
    @Property(tries = 200)
    @Label("encode followed by apply should reconstruct the target word")
    void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source,
            @ForAll("words") final String target) {
        final String encoded = new PatchCommandEncoder().encode(source, target);
        assertNotNull(encoded, "patch generation must succeed for non-null inputs.");

        final String rebuilt = PatchCommandEncoder.apply(source, encoded);
        assertEquals(target, rebuilt, "applying the encoded patch must reconstruct the target word.");
    }

    /**
     * Encodes the same pair twice on one encoder and once on a fresh encoder
     * and checks that all three patches are equal.
     *
     * @param source source word
     * @param target target word
     */
    @Property(tries = 150)
    @Label("encode should be deterministic for one source-target pair")
    void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source,
            @ForAll("words") final String target) {
        final PatchCommandEncoder reused = new PatchCommandEncoder();
        final String firstPass = reused.encode(source, target);
        final String secondPass = reused.encode(source, target);

        final PatchCommandEncoder untouched = new PatchCommandEncoder();
        final String independent = untouched.encode(source, target);

        assertEquals(firstPass, secondPass, "one encoder instance must produce stable output.");
        assertEquals(firstPass, independent, "fresh encoder instances must produce the same patch output.");
    }
}

View File

@@ -174,7 +174,13 @@ class PatchCommandEncoderTest {
// 9 // 9
Arguments.of(9, "", "-a"), Arguments.of(9, "", "-a"),
// 10 // 10
Arguments.of(10, "", "Ra")); Arguments.of(10, "", "Ra"),
// 11
Arguments.of(11, "abc", "D`"),
// 12
Arguments.of(12, "abc", "-`"),
// 13
Arguments.of(13, "", "D`"));
} }
/** /**

View File

@@ -0,0 +1,326 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.function.IntFunction;
import net.jqwik.api.Arbitraries;
import net.jqwik.api.Arbitrary;
import net.jqwik.api.Combinators;
import net.jqwik.api.Provide;
import net.jqwik.api.arbitraries.ListArbitrary;
/**
* Shared jqwik generators and helpers for property-based tests covering the
* Radixor algorithmic core.
*
* <p>
* The generated domains are intentionally bounded to keep CI execution time
* predictable while still exploring a broad range of trie shapes, duplicate
* insertions, missing lookups, and patch-command transformations.
*/
abstract class PropertyBasedTestSupport {
/**
* Shared array factory for string tries.
*/
protected static final IntFunction<String[]> STRING_ARRAY_FACTORY = String[]::new;
/**
 * Supplies strings of zero to twelve characters drawn from the letters
 * {@code a} through {@code l}; the empty string is part of the domain.
 *
 * @return bounded word generator
 */
@Provide
protected Arbitrary<String> words() {
    final char[] alphabet = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l' };
    return Arbitraries.strings()
            .withChars(alphabet)
            .ofMinLength(0)
            .ofMaxLength(12);
}
/**
 * Supplies strings of one to twelve characters drawn from the letters
 * {@code a} through {@code l}.
 *
 * @return bounded non-empty word generator
 */
@Provide
protected Arbitrary<String> nonEmptyWords() {
    final char[] alphabet = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l' };
    return Arbitraries.strings()
            .withChars(alphabet)
            .ofMinLength(1)
            .ofMaxLength(12);
}
/**
 * Supplies trie scenarios made of 1 to 24 insertions (key from
 * {@link #words()}, value from {@link #nonEmptyWords()}, count between 1 and
 * 5) plus up to 16 additional lookup probes; every inserted key is also
 * included among the observed keys.
 *
 * @return trie scenario generator
 */
@Provide
protected Arbitrary<TrieScenario> trieScenarios() {
    final Arbitrary<String> keys = words();
    final Arbitrary<String> values = nonEmptyWords();
    final Arbitrary<Integer> counts = Arbitraries.integers().between(1, 5);
    final Arbitrary<TrieInsertion> singleInsertion = Combinators.combine(keys, values, counts)
            .as(TrieInsertion::new);

    final ListArbitrary<TrieInsertion> insertionLists = singleInsertion.list().ofMinSize(1).ofMaxSize(24);
    final Arbitrary<List<String>> extraProbes = words().list().ofMinSize(0).ofMaxSize(16);

    return Combinators.combine(insertionLists, extraProbes).as((inserted, probes) -> {
        final Set<String> merged = mergeObservedKeys(inserted, probes);
        return new TrieScenario(inserted, merged);
    });
}
/**
 * Supplies stemmer scenarios of 1 to 10 entries. Each entry pairs a
 * non-empty stem with a generated set of 1 to 4 non-empty variants, to which
 * the stem itself is always added.
 *
 * @return stemmer scenario generator
 */
@Provide
protected Arbitrary<StemmerScenario> stemmerScenarios() {
    final Arbitrary<String> stems = nonEmptyWords();
    final Arbitrary<Set<String>> variantSets = nonEmptyWords().set().ofMinSize(1).ofMaxSize(4);

    final Arbitrary<StemmerEntry> singleEntry = Combinators.combine(stems, variantSets).as((stem, variants) -> {
        final Set<String> withStem = new LinkedHashSet<>(variants);
        withStem.add(stem);
        return new StemmerEntry(stem, withStem);
    });

    return singleEntry.list()
            .ofMinSize(1)
            .ofMaxSize(10)
            .map(StemmerScenario::new);
}
/**
 * Compiles a trie by replaying every insertion of the scenario, in list
 * order, into a fresh builder configured with the given reduction mode.
 *
 * @param scenario      trie scenario
 * @param reductionMode reduction mode
 * @return compiled trie
 * @throws NullPointerException if any argument is {@code null}
 */
protected FrequencyTrie<String> buildTrie(final TrieScenario scenario, final ReductionMode reductionMode) {
    Objects.requireNonNull(scenario, "scenario");
    Objects.requireNonNull(reductionMode, "reductionMode");

    final FrequencyTrie.Builder<String> trieBuilder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY,
            reductionMode);
    scenario.insertions().forEach(step -> trieBuilder.put(step.key(), step.value(), step.count()));
    return trieBuilder.build();
}
/**
 * Compiles a patch-command trie from a stemmer scenario.
 *
 * <p>
 * For each entry, the stem is optionally stored under
 * {@link PatchCommandEncoder#NOOP_PATCH}; every variant that differs from
 * the stem is stored under the patch that the encoder produces for the
 * variant-to-stem transformation.
 *
 * @param scenario      stemmer scenario
 * @param reductionMode reduction mode
 * @param storeOriginal whether each stem is additionally stored with the
 *                      no-op patch
 * @return compiled patch-command trie
 * @throws NullPointerException if {@code scenario} or {@code reductionMode}
 *                              is {@code null}
 */
protected FrequencyTrie<String> buildStemmerTrie(final StemmerScenario scenario, final ReductionMode reductionMode,
        final boolean storeOriginal) {
    Objects.requireNonNull(scenario, "scenario");
    Objects.requireNonNull(reductionMode, "reductionMode");

    final FrequencyTrie.Builder<String> trieBuilder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY,
            reductionMode);
    final PatchCommandEncoder patchEncoder = new PatchCommandEncoder();
    for (final StemmerEntry entry : scenario.entries()) {
        final String stem = entry.stem();
        if (storeOriginal) {
            trieBuilder.put(stem, PatchCommandEncoder.NOOP_PATCH);
        }
        for (final String variant : entry.variants()) {
            // The stem itself never gets an encoded patch; only the no-op above may cover it.
            if (variant.equals(stem)) {
                continue;
            }
            trieBuilder.put(variant, patchEncoder.encode(variant, stem));
        }
    }
    return trieBuilder.build();
}
/**
 * Collects lookup probes into one insertion-ordered set: first the key of
 * every insertion, then the additional probes. Duplicates keep their first
 * position.
 *
 * @param insertions             inserted trie mappings
 * @param additionalObservedKeys extra lookup probes
 * @return merged lookup-key set
 */
private static Set<String> mergeObservedKeys(final List<TrieInsertion> insertions,
        final List<String> additionalObservedKeys) {
    final Set<String> merged = new LinkedHashSet<>();
    insertions.forEach(insertion -> merged.add(insertion.key()));
    merged.addAll(additionalObservedKeys);
    return merged;
}
/**
* Generated insertion into a trie builder.
*
* @param key trie key
* @param value stored value
* @param count positive insertion count
*/
protected record TrieInsertion(String key, String value, int count) {
/**
* Creates a validated insertion descriptor.
*
* @param key trie key
* @param value stored value
* @param count positive insertion count
*/
public TrieInsertion {
Objects.requireNonNull(key, "key");
Objects.requireNonNull(value, "value");
if (count < 1) {
throw new IllegalArgumentException("count must be at least 1.");
}
}
}
/**
 * Generated trie scenario used by multiple properties.
 *
 * <p>
 * Both components are defensively copied and unmodifiable. The observed keys
 * keep the iteration order of the supplied set, so a scenario built from an
 * insertion-ordered set is probed in the same order on every run.
 *
 * @param insertions   generated insertions
 * @param observedKeys lookup probes
 */
protected record TrieScenario(List<TrieInsertion> insertions, Set<String> observedKeys) {
    /**
     * Creates a validated trie scenario.
     *
     * @param insertions   generated insertions
     * @param observedKeys lookup probes
     * @throws NullPointerException     if a component, an insertion, or an
     *                                  observed key is {@code null}
     * @throws IllegalArgumentException if {@code insertions} is empty
     */
    public TrieScenario {
        Objects.requireNonNull(insertions, "insertions");
        Objects.requireNonNull(observedKeys, "observedKeys");
        insertions = List.copyOf(insertions);
        // Set.copyOf would leave iteration order unspecified; an ordered copy keeps
        // the probe order reproducible across runs.
        final Set<String> orderedKeys = new LinkedHashSet<>(observedKeys);
        if (orderedKeys.contains(null)) {
            throw new NullPointerException("observedKeys must not contain null.");
        }
        observedKeys = java.util.Collections.unmodifiableSet(orderedKeys);
        if (insertions.isEmpty()) {
            throw new IllegalArgumentException("insertions must not be empty.");
        }
    }

    /**
     * Returns a compact summary with the component sizes only, keeping
     * property-test reports short.
     */
    @Override
    public String toString() {
        return "TrieScenario[insertions=" + this.insertions.size() + ", observedKeys=" + this.observedKeys.size()
                + "]";
    }
}
/**
 * Generated stemmer dictionary line equivalent.
 *
 * <p>
 * The variants are defensively copied into an unmodifiable set that keeps
 * the iteration order of the supplied set, so code that walks the variants
 * (for example to feed a trie builder) sees them in the same order on every
 * run.
 *
 * @param stem     canonical stem
 * @param variants variants accepted for the stem
 */
protected record StemmerEntry(String stem, Set<String> variants) {
    /**
     * Creates a validated stemmer entry.
     *
     * @param stem     canonical stem
     * @param variants variants accepted for the stem
     * @throws NullPointerException     if a component or a variant is
     *                                  {@code null}
     * @throws IllegalArgumentException if the stem or the variant set is empty
     */
    public StemmerEntry {
        Objects.requireNonNull(stem, "stem");
        Objects.requireNonNull(variants, "variants");
        // Set.copyOf would leave iteration order unspecified; an ordered copy keeps
        // traversal reproducible across runs.
        final Set<String> orderedVariants = new LinkedHashSet<>(variants);
        if (orderedVariants.contains(null)) {
            throw new NullPointerException("variants must not contain null.");
        }
        variants = java.util.Collections.unmodifiableSet(orderedVariants);
        if (stem.isEmpty()) {
            throw new IllegalArgumentException("stem must not be empty.");
        }
        if (variants.isEmpty()) {
            throw new IllegalArgumentException("variants must not be empty.");
        }
    }
}
/**
 * A non-empty, immutable list of stemmer entries together with helpers that
 * derive lookup words and acceptable stems from it.
 *
 * @param entries generated entries
 */
protected record StemmerScenario(List<StemmerEntry> entries) {
    /**
     * Copies the entry list and rejects {@code null} or empty input.
     *
     * @param entries generated entries
     */
    public StemmerScenario {
        if (entries == null) {
            throw new NullPointerException("entries");
        }
        entries = List.copyOf(entries);
        if (entries.isEmpty()) {
            throw new IllegalArgumentException("entries must not be empty.");
        }
    }

    /**
     * Collects every stem and every variant of all entries into one set,
     * visiting entries in list order.
     *
     * @return observed lookup words
     */
    public Set<String> observedWords() {
        final Set<String> words = new LinkedHashSet<>();
        this.entries.forEach(entry -> {
            words.add(entry.stem());
            words.addAll(entry.variants());
        });
        return words;
    }

    /**
     * Collects the stems of all entries in which {@code word} appears either
     * as the stem or as one of the variants.
     *
     * @param word observed word
     * @return acceptable stems
     */
    public Set<String> acceptableStemsFor(final String word) {
        final Set<String> matches = new LinkedHashSet<>();
        for (final StemmerEntry entry : this.entries) {
            final boolean unrelated = !entry.stem().equals(word) && !entry.variants().contains(word);
            if (unrelated) {
                continue;
            }
            matches.add(entry.stem());
        }
        return matches;
    }

    /**
     * Returns a compact summary with the entry count only.
     */
    @Override
    public String toString() {
        return "StemmerScenario[entries=" + this.entries.size() + "]";
    }
}
}

View File

@@ -0,0 +1,151 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.LinkedHashSet;
import java.util.Set;
import net.jqwik.api.ForAll;
import net.jqwik.api.Label;
import net.jqwik.api.Property;
import net.jqwik.api.Tag;
/**
 * jqwik properties for tries that store patch commands.
 *
 * <p>
 * The properties check that every patch returned for a word of the generated
 * scenario decodes to a stem the scenario declares for that word, and that a
 * round trip through {@link StemmerPatchTrieBinaryIO} leaves lookups
 * unchanged.
 */
@Label("Stemmer patch trie properties")
@Tag("unit")
@Tag("property")
@Tag("stemming")
class StemmerPatchTrieProperties extends PropertyBasedTestSupport {
    /**
     * For each observed word, checks that a non-empty preferred patch and at
     * least one patch overall exist, that all of them decode to declared
     * stems, and that a word which is itself a declared stem is among the
     * decoded results.
     *
     * @param scenario      generated stemmer scenario
     * @param reductionMode reduction mode
     */
    @Property(tries = 60)
    @Label("returned patches should reconstruct only acceptable stems")
    void returnedPatchesShouldReconstructOnlyAcceptableStems(@ForAll("stemmerScenarios") final StemmerScenario scenario,
            @ForAll final ReductionMode reductionMode) {
        final FrequencyTrie<String> compiled = buildStemmerTrie(scenario, reductionMode, true);
        for (final String word : scenario.observedWords()) {
            final Set<String> allowed = scenario.acceptableStemsFor(word);
            final String best = compiled.get(word);
            final String[] everyPatch = compiled.getAll(word);

            final boolean hasBest = best != null && !best.isEmpty();
            assertTrue(hasBest, "preferred patch must exist for an observed word.");
            assertTrue(everyPatch.length >= 1, "at least one patch must exist for an observed word.");

            final String bestStem = PatchCommandEncoder.apply(word, best);
            assertTrue(allowed.contains(bestStem), "preferred patch reconstructed an unexpected stem.");

            final Set<String> decoded = applyAll(word, everyPatch);
            assertTrue(allowed.containsAll(decoded),
                    "getAll() must not expose a patch that reconstructs an undeclared stem.");

            // The trie is built with storeOriginal = true, so a declared stem must map to itself.
            if (allowed.contains(word)) {
                assertTrue(decoded.contains(word),
                        "storeOriginal semantics must preserve the original stem among returned results.");
            }
        }
    }

    /**
     * Checks that {@code get()} and {@code getAll()} return the same results
     * before and after a binary write/read round trip.
     *
     * @param scenario      generated stemmer scenario
     * @param reductionMode reduction mode
     */
    @Property(tries = 30)
    @Label("binary persistence should preserve patch-command trie lookups")
    void binaryPersistenceShouldPreservePatchCommandTrieLookups(
            @ForAll("stemmerScenarios") final StemmerScenario scenario, @ForAll final ReductionMode reductionMode) {
        final FrequencyTrie<String> before = buildStemmerTrie(scenario, reductionMode, true);
        final FrequencyTrie<String> after = roundTripCompressed(before);
        for (final String word : scenario.observedWords()) {
            final String expectedBest = before.get(word);
            assertEquals(expectedBest, after.get(word), "preferred patch lookup drifted after persistence.");

            final String[] expectedAll = before.getAll(word);
            assertArrayEquals(expectedAll, after.getAll(word),
                    "complete patch result set drifted after persistence.");
        }
    }

    /**
     * Decodes every patch against the same source word and gathers the
     * distinct results in encounter order.
     *
     * @param source  source word
     * @param patches returned patches
     * @return decoded stem set
     */
    private static Set<String> applyAll(final String source, final String[] patches) {
        final Set<String> decoded = new LinkedHashSet<>();
        for (int index = 0; index < patches.length; index++) {
            decoded.add(PatchCommandEncoder.apply(source, patches[index]));
        }
        return decoded;
    }

    /**
     * Writes the trie to an in-memory buffer with
     * {@link StemmerPatchTrieBinaryIO} and reads it back; an
     * {@link IOException} is rethrown as {@link UncheckedIOException}.
     *
     * @param trie trie to persist and reload
     * @return reloaded trie
     */
    private static FrequencyTrie<String> roundTripCompressed(final FrequencyTrie<String> trie) {
        final ByteArrayOutputStream sink = new ByteArrayOutputStream();
        try {
            StemmerPatchTrieBinaryIO.write(trie, sink);
            final byte[] persisted = sink.toByteArray();
            return StemmerPatchTrieBinaryIO.read(new ByteArrayInputStream(persisted));
        } catch (IOException failure) {
            throw new UncheckedIOException("Unexpected compressed binary round-trip failure.", failure);
        }
    }
}

View File

@@ -0,0 +1,148 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Exercises constructor validation of {@link CompiledNode} and
 * {@link NodeData} and checks that their accessors hand back the very arrays
 * passed to the constructor.
 */
@Tag("unit")
@Tag("fast")
@Tag("trie")
@DisplayName("CompiledNode and NodeData")
class CompiledNodeAndNodeDataTest {
    /**
     * One edge label paired with zero child ids must be rejected by
     * {@link NodeData}.
     */
    @Test
    @DisplayName("NodeData rejects mismatched edge arrays")
    void nodeDataShouldRejectMismatchedEdgeArrays() {
        final char[] oneLabel = { 'a' };
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> new NodeData<String>(oneLabel, new int[0], new String[0], new int[0]));

        final String message = thrown.getMessage();
        assertEquals("edgeLabels and childNodeIds must have the same length.", message);
    }

    /**
     * One value paired with zero counts must be rejected by {@link NodeData}.
     */
    @Test
    @DisplayName("NodeData rejects mismatched value arrays")
    void nodeDataShouldRejectMismatchedValueArrays() {
        final String[] oneValue = { "stem" };
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> new NodeData<String>(new char[0], new int[0], oneValue, new int[0]));

        final String message = thrown.getMessage();
        assertEquals("orderedValues and orderedCounts must have the same length.", message);
    }

    /**
     * Every {@link NodeData} accessor must return the identical array
     * instance that was supplied at construction.
     */
    @Test
    @DisplayName("NodeData accessors expose documented backing arrays")
    void nodeDataAccessorsShouldExposeDocumentedBackingArrays() {
        final char[] labels = { 'a' };
        final int[] childIds = { 7 };
        final String[] values = { "stem" };
        final int[] counts = { 3 };

        final NodeData<String> data = new NodeData<>(labels, childIds, values, counts);

        assertSame(labels, data.edgeLabels());
        assertSame(childIds, data.childNodeIds());
        assertSame(values, data.orderedValues());
        assertSame(counts, data.orderedCounts());
    }

    /**
     * One edge label paired with zero children must be rejected by
     * {@link CompiledNode}.
     */
    @Test
    @DisplayName("CompiledNode rejects mismatched edge and child arrays")
    void compiledNodeShouldRejectMismatchedEdgeAndChildArrays() {
        @SuppressWarnings("unchecked")
        final CompiledNode<String>[] noChildren = new CompiledNode[0];
        final char[] oneLabel = { 'a' };
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> new CompiledNode<String>(oneLabel, noChildren, new String[0], new int[0]));

        final String message = thrown.getMessage();
        assertEquals("edgeLabels and children must have the same length.", message);
    }

    /**
     * One value paired with zero counts must be rejected by
     * {@link CompiledNode}.
     */
    @Test
    @DisplayName("CompiledNode rejects mismatched value arrays")
    void compiledNodeShouldRejectMismatchedValueArrays() {
        @SuppressWarnings("unchecked")
        final CompiledNode<String>[] noChildren = new CompiledNode[0];
        final String[] oneValue = { "stem" };
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> new CompiledNode<String>(new char[0], noChildren, oneValue, new int[0]));

        final String message = thrown.getMessage();
        assertEquals("orderedValues and orderedCounts must have the same length.", message);
    }

    /**
     * Every {@link CompiledNode} accessor must return the identical array
     * instance that was supplied at construction.
     */
    @Test
    @DisplayName("CompiledNode accessors expose documented backing arrays")
    void compiledNodeAccessorsShouldExposeDocumentedBackingArrays() {
        final char[] labels = { 'a' };
        @SuppressWarnings("unchecked")
        final CompiledNode<String>[] childSlots = new CompiledNode[1];
        final String[] values = { "stem" };
        final int[] counts = { 5 };

        final CompiledNode<String> compiled = new CompiledNode<>(labels, childSlots, values, counts);

        assertSame(labels, compiled.edgeLabels());
        assertSame(childSlots, compiled.children());
        assertSame(values, compiled.orderedValues());
        assertSame(counts, compiled.orderedCounts());
    }
}

253
tools/generate-pages-badges.py Executable file
View File

@@ -0,0 +1,253 @@
#!/usr/bin/env python3
"""
Generate GitHub Pages badge endpoint JSON files from CI report artifacts.
This script derives compact machine-readable badge payloads from:
- JaCoCo XML coverage report
- PIT mutation testing XML report
- JMH CSV benchmark report
The generated JSON files are intended to be consumed by Shields endpoint badges.
"""
from __future__ import annotations
import argparse
import csv
import json
import os
from pathlib import Path
import xml.etree.ElementTree as ET
def parse_args() -> argparse.Namespace:
    """Read the report locations and output directories from the command line.

    All five options are mandatory; argparse reports a usage error when one is
    missing.

    Returns:
        Namespace exposing ``jacoco_xml``, ``pit_xml``, ``jmh_csv``,
        ``run_metrics_dir`` and ``latest_metrics_dir`` as strings.
    """
    # Registration order is kept because it determines the order in --help.
    required_options = (
        ("--jacoco-xml", "Path to the JaCoCo XML report."),
        ("--pit-xml", "Path to the PIT XML report."),
        ("--jmh-csv", "Path to the JMH CSV report."),
        ("--run-metrics-dir", "Target directory for the current build badge JSON files."),
        ("--latest-metrics-dir", "Target directory for the latest build badge JSON files."),
    )
    cli = argparse.ArgumentParser(
        description="Generate GitHub Pages badge metadata from build reports."
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    return cli.parse_args()
def write_json(target: Path, payload: dict[str, object]) -> None:
    """Write a badge payload as formatted UTF-8 JSON.

    Missing parent directories of ``target`` are created. The JSON is indented
    by two spaces and followed by exactly one line terminator.

    Args:
        target: File to create or overwrite.
        payload: JSON-serializable badge description.
    """
    target.parent.mkdir(parents=True, exist_ok=True)
    # write_text() opens the file in text mode, which already translates "\n"
    # to the platform line ending. Appending os.linesep instead would be
    # translated a second time and produce "\r\r\n" on Windows.
    target.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
def unavailable_payload(label: str) -> dict[str, object]:
    """Return the light-grey "not available" badge payload for ``label``.

    Args:
        label: Text shown on the left-hand side of the badge.
    """
    return dict(
        schemaVersion=1,
        label=label,
        message="not available",
        color="lightgrey",
    )
def color_for_percentage(value: float) -> str:
    """Map a percentage to a badge color name.

    Thresholds are inclusive lower bounds: 85 -> brightgreen, 70 -> green,
    55 -> yellow, 40 -> orange; anything lower is red.
    """
    # Ordered from the highest threshold down; the first one reached wins.
    thresholds = (
        (85.0, "brightgreen"),
        (70.0, "green"),
        (55.0, "yellow"),
        (40.0, "orange"),
    )
    for lower_bound, color in thresholds:
        if value >= lower_bound:
            return color
    return "red"
def color_for_speedup(value: float) -> str:
    """Map a speedup factor to a badge color name.

    Thresholds are inclusive lower bounds: 4x -> brightgreen, 3x -> green,
    2x -> yellow, 1x -> orange; anything below 1x is red.
    """
    # Highest threshold first so that next() yields the best color reached.
    thresholds = (
        (4.0, "brightgreen"),
        (3.0, "green"),
        (2.0, "yellow"),
        (1.0, "orange"),
    )
    return next(
        (color for lower_bound, color in thresholds if value >= lower_bound),
        "red",
    )
def coverage_payload(jacoco_xml: Path) -> dict[str, object]:
    """Build the line-coverage badge payload from a JaCoCo XML report.

    Only ``counter`` elements that are direct children of the report root are
    inspected, and the first one with ``type="LINE"`` is used.

    Args:
        jacoco_xml: Location of the JaCoCo XML report.

    Returns:
        A "coverage" payload with the percentage formatted to one decimal, or
        the "not available" payload when the file or the LINE counter is
        missing.
    """
    if not jacoco_xml.is_file():
        return unavailable_payload("coverage")

    report_root = ET.parse(jacoco_xml).getroot()
    line_counter = next(
        (
            candidate
            for candidate in report_root.findall("counter")
            if candidate.attrib.get("type") == "LINE"
        ),
        None,
    )
    if line_counter is None:
        return unavailable_payload("coverage")

    missed = int(line_counter.attrib.get("missed", "0"))
    covered = int(line_counter.attrib.get("covered", "0"))
    total = missed + covered
    # A report without any lines is shown as 0.0% instead of dividing by zero.
    percentage = (100.0 * covered / total) if total != 0 else 0.0

    return {
        "schemaVersion": 1,
        "label": "coverage",
        "message": f"{percentage:.1f}%",
        "color": color_for_percentage(percentage)
    }
def mutation_payload(pit_xml: Path) -> dict[str, object]:
    """Build the mutation-score badge payload from a PIT XML report.

    When the root element carries a ``mutationCoverage`` attribute, it is used
    as the score. Otherwise the score is the share of the root's direct
    ``mutation`` children whose ``status`` counts as detected.

    Args:
        pit_xml: Location of the PIT XML report.

    Returns:
        A "mutation" payload with the score formatted to one decimal, or the
        "not available" payload when the file does not exist.
    """
    if not pit_xml.is_file():
        return unavailable_payload("mutation")

    report_root = ET.parse(pit_xml).getroot()
    reported_score = report_root.attrib.get("mutationCoverage")
    if reported_score is None:
        # Statuses treated as "mutant detected" for the score.
        detected_statuses = {
            "KILLED",
            "TIMED_OUT",
            "MEMORY_ERROR",
            "RUN_ERROR",
            "NON_VIABLE"
        }
        statuses = [
            mutation.attrib.get("status")
            for mutation in report_root.findall("mutation")
        ]
        total = len(statuses)
        detected = len([status for status in statuses if status in detected_statuses])
        # A report without mutations is shown as 0.0% instead of dividing by zero.
        score = (100.0 * detected / total) if total != 0 else 0.0
    else:
        score = float(reported_score)

    return {
        "schemaVersion": 1,
        "label": "mutation",
        "message": f"{score:.1f}%",
        "color": color_for_percentage(score)
    }
def parse_family_count(row: dict[str, str]) -> int:
    """Read the JMH ``familyCount`` parameter from one CSV row.

    The first column whose name starts with ``"Param: "`` and ends with
    ``"familyCount"`` decides the result; later matching columns are ignored.

    Returns:
        The parameter as an integer, or -1 when no such column exists or its
        value cannot be converted.
    """
    for column in row:
        is_family_count = column.startswith("Param: ") and column.endswith("familyCount")
        if not is_family_count:
            continue
        try:
            return int(row[column])
        except (TypeError, ValueError):
            return -1
    return -1
def benchmark_payload(jmh_csv: Path) -> dict[str, object]:
    """Build the benchmark speedup badge payload from a JMH CSV report.

    Only rows of the Radixor and Snowball Porter English stemmer benchmarks are
    considered, and of those only the rows with the largest ``familyCount``
    parameter. If several rows remain for one benchmark, the last one wins.

    Args:
        jmh_csv: Location of the JMH CSV report.

    Returns:
        An "english benchmark" payload with the Porter/Radixor score ratio, or
        the "not available" payload when the file is missing or empty, when
        one of the two benchmarks has no usable row, or when either score is
        not strictly positive.
    """
    radixor_suffix = "EnglishStemmerComparisonBenchmark.radixorUsUkProfiPreferredStem"
    porter_suffix = "EnglishStemmerComparisonBenchmark.snowballOriginalPorter"

    if not jmh_csv.is_file():
        return unavailable_payload("english benchmark")

    with jmh_csv.open("r", encoding="utf-8", newline="") as input_file:
        rows = list(csv.DictReader(input_file))
    if not rows:
        return unavailable_payload("english benchmark")

    relevant_rows: list[tuple[int, str, float]] = []
    for row in rows:
        benchmark = row.get("Benchmark", "")
        if not benchmark.endswith((radixor_suffix, porter_suffix)):
            continue
        try:
            score = float(row["Score"])
        except (KeyError, TypeError, ValueError):
            # Rows without a numeric score cannot contribute to the ratio.
            continue
        relevant_rows.append((parse_family_count(row), benchmark, score))
    if not relevant_rows:
        return unavailable_payload("english benchmark")

    best_family_count = max(family_count for family_count, _, _ in relevant_rows)
    radixor_score = None
    porter_score = None
    for family_count, benchmark, score in relevant_rows:
        if family_count != best_family_count:
            continue
        if benchmark.endswith(radixor_suffix):
            radixor_score = score
        elif benchmark.endswith(porter_suffix):
            porter_score = score

    # radixor_score is the divisor below, so it must be checked as well;
    # guarding only porter_score let a zero Radixor score raise ZeroDivisionError.
    if (
        radixor_score is None
        or porter_score is None
        or porter_score <= 0.0
        or radixor_score <= 0.0
    ):
        return unavailable_payload("english benchmark")

    # score is time for the batch processing, i.e. longer => slower, i.e. speedup is porter/radixor
    speedup = porter_score / radixor_score
    family_suffix = "" if best_family_count < 0 else f" ({best_family_count})"
    return {
        "schemaVersion": 1,
        "label": "english benchmark",
        "message": f"{speedup:.1f}x vs Porter{family_suffix}",
        "color": color_for_speedup(speedup)
    }
def main() -> int:
    """Generate the three badge files and write each into both output directories.

    Returns:
        Always 0; failures surface as exceptions.
    """
    arguments = parse_args()

    # Payloads are computed once and reused for both target directories.
    badge_payloads = (
        ("coverage-badge.json", coverage_payload(Path(arguments.jacoco_xml))),
        ("pitest-badge.json", mutation_payload(Path(arguments.pit_xml))),
        ("jmh-badge.json", benchmark_payload(Path(arguments.jmh_csv))),
    )
    output_dirs = (
        Path(arguments.run_metrics_dir),
        Path(arguments.latest_metrics_dir),
    )

    for file_name, payload in badge_payloads:
        for output_dir in output_dirs:
            write_json(output_dir / file_name, payload)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

114
tools/generate-release-notes.sh Executable file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/env bash
#
# Builds categorized release notes for a "release@<version>" tag from the
# conventional-commit style lines of the commits since the previous release
# tag, and prints them to stdout.
#
# Usage: generate-release-notes.sh [tag]
#   The tag comes from GITHUB_REF_NAME when that is set and non-empty,
#   otherwise from the first argument.
set -Eeuo pipefail

current_tag="${GITHUB_REF_NAME:-${1:-}}"
[[ -n "${current_tag}" ]] || {
  echo "Current tag is not set. Provide it as GITHUB_REF_NAME or as the first argument." >&2
  exit 1
}

release_prefix="release@"
case "${current_tag}" in
  "${release_prefix}"*) ;;
  *)
    echo "Current tag '${current_tag}' does not start with expected prefix '${release_prefix}'." >&2
    exit 1
    ;;
esac

# Best effort: a failed fetch is ignored and the locally known tags are used.
git fetch --tags --force &>/dev/null || true

# Release versions (prefix stripped), one per line, in ascending version order.
all_versions="$(
  git tag --list "${release_prefix}*" \
    | sed "s/^${release_prefix}//" \
    | sort -V
)"

# The previous release is the tag sorted immediately before the current one.
previous_tag=""
# Deliberately unquoted: word splitting yields one version per iteration.
for version in ${all_versions}; do
  candidate_tag="${release_prefix}${version}"
  if [[ "${candidate_tag}" == "${current_tag}" ]]; then
    break
  fi
  previous_tag="${candidate_tag}"
done

# Without a previous release the range is just the current tag.
range="${previous_tag:+${previous_tag}..}${current_tag}"
printf 'Generating release notes for range: %s\n' "${range}" >&2

# "<commit type>|<section title>" pairs in the order the sections are printed.
declare -a CATEGORY_ORDER=(
  "feat|Features"
  "fix|Bug Fixes"
  "perf|Performance"
  "refactor|Refactoring"
  "docs|Documentation"
  "test|Tests"
  "build|Build System"
  "ci|CI/CD"
  "style|Style"
  "chore|Maintenance"
  "revert|Reverts"
)

declare -A CATEGORY_TITLES
declare -A CATEGORY_ITEMS
for entry in "${CATEGORY_ORDER[@]}"; do
  IFS='|' read -r key title <<< "${entry}"
  CATEGORY_TITLES["${key}"]="${title}"
  CATEGORY_ITEMS["${key}"]=""
done

# Capture group 1 is the commit type, group 3 the message after "type(scope)!:".
supported_prefix_pattern='^(feat|fix|perf|refactor|docs|test|build|ci|style|chore|revert)(\([^)]+\))?!?:[[:space:]]*(.+)$'
# ASCII unit separator, used to delimit the fields of one git log record.
separator=$'\x1f'
#######################################
# Files one commit line under its category when it has a supported
# "type(scope)!: message" prefix; every other line is ignored.
# Globals:
#   supported_prefix_pattern (read)
#   CATEGORY_ITEMS (appended to)
# Arguments:
#   $1 - a commit subject or one line of a commit body
# Returns:
#   0 in all cases
#######################################
append_line() {
  local line="$1"
  local trimmed
  local kind
  local text

  # Remove carriage returns, then surrounding whitespace.
  trimmed="$(
    printf '%s' "${line}" \
      | tr -d '\r' \
      | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
  )"
  if [[ -z "${trimmed}" ]]; then
    return 0
  fi

  if [[ ! "${trimmed}" =~ ${supported_prefix_pattern} ]]; then
    return 0
  fi

  kind="${BASH_REMATCH[1]}"
  text="${BASH_REMATCH[3]}"
  if [[ -n "${text}" ]]; then
    CATEGORY_ITEMS["${kind}"]+="- ${text}"$'\n'
  fi
  return 0
}
#######################################
# Feeds the subject and every body line of each non-merge commit in the
# release range to append_line.
#
# Records are NUL-delimited (git log -z). A newline-delimited read cut each
# multi-line body after its first line and also lost the final record, which
# "--pretty=format:" leaves unterminated.
# Globals:
#   range (read), separator (read)
# Arguments:
#   None
#######################################
collect_commit_entries() {
  local record commit_hash remainder subject body body_line

  # "|| [[ -n ... ]]" keeps the last record, which has no trailing NUL.
  while IFS= read -r -d '' record || [[ -n "${record}" ]]; do
    # A usable record carries both field separators: hash, subject, body.
    [[ "${record}" == *"${separator}"*"${separator}"* ]] || continue
    commit_hash="${record%%"${separator}"*}"
    remainder="${record#*"${separator}"}"
    subject="${remainder%%"${separator}"*}"
    body="${remainder#*"${separator}"}"

    [[ -n "${commit_hash}" ]] || continue
    if [[ "${subject}" =~ ^Merge[[:space:]] ]] || [[ "${subject}" == "Initial commit" ]]; then
      continue
    fi

    append_line "${subject}"
    while IFS= read -r body_line; do
      append_line "${body_line}"
    done <<< "${body}"
  done < <(git log -z "${range}" --no-merges --pretty=format:"%H${separator}%s${separator}%b")
}

collect_commit_entries

# Render one "### <title>" section per non-empty category, in CATEGORY_ORDER.
body_text="## What's New"
for entry in "${CATEGORY_ORDER[@]}"; do
  key="${entry%%|*}"
  title="${CATEGORY_TITLES[${key}]}"
  items="${CATEGORY_ITEMS[${key}]}"
  if [[ -n "${items}" ]]; then
    body_text+=$'\n\n'"### ${title}"$'\n'
    # Drop blank lines; the command substitution also strips the trailing newline.
    body_text+="$(printf '%s' "${items}" | sed '/^[[:space:]]*$/d')"
  fi
done

# Nothing was appended above, so no commit line matched a supported category.
if [[ "${body_text}" == "## What's New" ]]; then
  body_text+=$'\n\n'"No categorized changes were found in commit subjects or bodies for this release range."
fi

printf '%s\n' "${body_text}"