Compare commits
6 Commits
release@0.
...
release@0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
7e1aea72bf
|
|||
|
594abe2c4b
|
|||
|
953ce2226a
|
|||
|
05692726c5
|
|||
|
c18563617d
|
|||
|
436deefd14
|
@@ -3,20 +3,20 @@
|
||||
<classpathentry kind="src" output="bin/main" path="src/main/java">
|
||||
<attributes>
|
||||
<attribute name="gradle_scope" value="main"/>
|
||||
<attribute name="gradle_used_by_scope" value="main,test,jmh"/>
|
||||
<attribute name="gradle_used_by_scope" value="main,test"/>
|
||||
</attributes>
|
||||
</classpathentry>
|
||||
<classpathentry kind="src" output="bin/test" path="src/test/java">
|
||||
<attributes>
|
||||
<attribute name="gradle_scope" value="test"/>
|
||||
<attribute name="gradle_used_by_scope" value="test,jmh"/>
|
||||
<attribute name="gradle_used_by_scope" value="test"/>
|
||||
<attribute name="test" value="true"/>
|
||||
</attributes>
|
||||
</classpathentry>
|
||||
<classpathentry kind="src" output="bin/main" path="src/main/resources">
|
||||
<attributes>
|
||||
<attribute name="gradle_scope" value="main"/>
|
||||
<attribute name="gradle_used_by_scope" value="main,test,jmh"/>
|
||||
<attribute name="gradle_used_by_scope" value="main,test"/>
|
||||
</attributes>
|
||||
</classpathentry>
|
||||
<classpathentry kind="src" output="bin/jmh" path="src/jmh/java">
|
||||
@@ -36,7 +36,7 @@
|
||||
<classpathentry kind="src" output="bin/test" path="src/test/resources">
|
||||
<attributes>
|
||||
<attribute name="gradle_scope" value="test"/>
|
||||
<attribute name="gradle_used_by_scope" value="test,jmh"/>
|
||||
<attribute name="gradle_used_by_scope" value="test"/>
|
||||
<attribute name="test" value="true"/>
|
||||
</attributes>
|
||||
</classpathentry>
|
||||
|
||||
26
.github/workflows/build.yml
vendored
26
.github/workflows/build.yml
vendored
@@ -156,11 +156,31 @@ jobs:
|
||||
test -f gradle.properties
|
||||
test -f gradle/verification-metadata.xml
|
||||
|
||||
- name: Build release distribution, signed Maven bundle, and SBOM
|
||||
- name: Generate release changelog for tagged builds
|
||||
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/release@')
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
chmod +x ./tools/generate-release-notes.sh
|
||||
mkdir -p build/generated/release-notes
|
||||
./tools/generate-release-notes.sh "${GITHUB_REF_NAME}" > build/generated/release-notes/CHANGELOG.md
|
||||
|
||||
- name: Build release inputs, signed Maven bundle, and SBOM
|
||||
env:
|
||||
SIGNING_KEY: ${{ secrets.SIGNING_KEY }}
|
||||
SIGNING_PASSWORD: ${{ secrets.SIGNING_PASSWORD }}
|
||||
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport distZip cyclonedxBom centralBundle
|
||||
run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport cyclonedxBom centralBundle
|
||||
|
||||
- name: Generate release changelog
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
chmod +x ./tools/generate-release-notes.sh
|
||||
mkdir -p build/generated/release-notes
|
||||
./tools/generate-release-notes.sh "${GITHUB_REF_NAME}" > build/generated/release-notes/CHANGELOG.md
|
||||
|
||||
- name: Package release distribution
|
||||
run: ./gradlew --no-daemon distZip
|
||||
|
||||
- name: Publish bundle to Maven Central
|
||||
shell: bash
|
||||
@@ -188,7 +208,7 @@ jobs:
|
||||
- name: Publish GitHub release assets
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
generate_release_notes: true
|
||||
body_path: build/generated/release-notes/CHANGELOG.md
|
||||
files: |
|
||||
build/distributions/*.zip
|
||||
build/reports/sbom/radixor-sbom.json
|
||||
|
||||
31
.github/workflows/pages.yml
vendored
31
.github/workflows/pages.yml
vendored
@@ -17,6 +17,7 @@ on:
|
||||
- 'gradlew'
|
||||
- 'gradlew.bat'
|
||||
- '.github/workflows/pages.yml'
|
||||
- 'tools/generate-pages-badges.py'
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
@@ -83,11 +84,13 @@ jobs:
|
||||
|
||||
SITE_DIR=".gh-pages"
|
||||
RUN_DIR="${SITE_DIR}/builds/${GITHUB_RUN_NUMBER}"
|
||||
RUN_METRICS_DIR="${RUN_DIR}/metrics"
|
||||
LATEST_DIR="${SITE_DIR}/builds/latest"
|
||||
LATEST_METRICS_DIR="${LATEST_DIR}/metrics"
|
||||
|
||||
mkdir -p "${RUN_DIR}"
|
||||
rm -rf "${LATEST_DIR}"
|
||||
mkdir -p "${LATEST_DIR}"
|
||||
mkdir -p "${LATEST_DIR}" "${RUN_METRICS_DIR}" "${LATEST_METRICS_DIR}"
|
||||
|
||||
cp -R build/docs/javadoc "${RUN_DIR}/javadoc"
|
||||
cp -R build/docs/javadoc "${LATEST_DIR}/javadoc"
|
||||
@@ -152,6 +155,26 @@ jobs:
|
||||
SBOM_XML_LATEST_LINK='<li><a href="./builds/latest/sbom/radixor-sbom.xml">SBOM (XML)</a></li>'
|
||||
fi
|
||||
|
||||
python3 \
|
||||
./tools/generate-pages-badges.py \
|
||||
--jacoco-xml build/reports/jacoco/test/jacocoTestReport.xml \
|
||||
--pit-xml build/reports/pitest/mutations.xml \
|
||||
--jmh-csv build/reports/jmh/jmh-results.csv \
|
||||
--run-metrics-dir "${RUN_METRICS_DIR}" \
|
||||
--latest-metrics-dir "${LATEST_METRICS_DIR}"
|
||||
|
||||
COVERAGE_BADGE_LINK='<li><a href="./metrics/coverage-badge.json">Coverage Badge Metadata</a></li>'
|
||||
COVERAGE_BADGE_LATEST_LINK='<li><a href="./builds/latest/metrics/coverage-badge.json">Coverage Badge Metadata</a></li>'
|
||||
MUTATION_BADGE_LINK='<li><a href="./metrics/pitest-badge.json">Mutation Badge Metadata</a></li>'
|
||||
MUTATION_BADGE_LATEST_LINK='<li><a href="./builds/latest/metrics/pitest-badge.json">Mutation Badge Metadata</a></li>'
|
||||
JMH_BADGE_LINK='<li><a href="./metrics/jmh-badge.json">Benchmark Badge Metadata</a></li>'
|
||||
JMH_BADGE_LATEST_LINK='<li><a href="./builds/latest/metrics/jmh-badge.json">Benchmark Badge Metadata</a></li>'
|
||||
|
||||
if [ ! -f "${RUN_METRICS_DIR}/coverage-badge.json" ]; then
|
||||
COVERAGE_BADGE_LINK='<li>Coverage Badge Metadata: not available</li>'
|
||||
COVERAGE_BADGE_LATEST_LINK='<li>Coverage Badge Metadata: not available</li>'
|
||||
fi
|
||||
|
||||
cat > "${RUN_DIR}/index.html" <<EOF
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
@@ -178,6 +201,9 @@ jobs:
|
||||
${DEPENDENCY_CHECK_LINK:-<li>Dependency Vulnerability Report: not available</li>}
|
||||
${SBOM_JSON_LINK:-<li>SBOM (JSON): not available</li>}
|
||||
${SBOM_XML_LINK:-<li>SBOM (XML): not available</li>}
|
||||
${COVERAGE_BADGE_LINK}
|
||||
${MUTATION_BADGE_LINK}
|
||||
${JMH_BADGE_LINK}
|
||||
<li><a href="./pitest/">Mutation Testing Report</a></li>
|
||||
$(
|
||||
[ "${HAS_JMH}" = "true" ] && { echo "${JMH_TXT_LINK:-<li>Benchmark Results (TXT): not available</li>}"; echo "${JMH_CSV_LINK:-<li>Benchmark Results (CSV): not available</li>}"; } \
|
||||
@@ -227,6 +253,9 @@ jobs:
|
||||
${DEPENDENCY_CHECK_LATEST_LINK:-<li>Dependency Vulnerability Report: not currently available</li>}
|
||||
${SBOM_JSON_LATEST_LINK:-<li>SBOM (JSON): not available</li>}
|
||||
${SBOM_XML_LATEST_LINK:-<li>SBOM (XML): not available</li>}
|
||||
${COVERAGE_BADGE_LATEST_LINK}
|
||||
${MUTATION_BADGE_LATEST_LINK}
|
||||
${JMH_BADGE_LATEST_LINK}
|
||||
<li><a href="./builds/latest/pitest/">Mutation Testing Report</a></li>
|
||||
$(
|
||||
[ "${HAS_JMH}" = "true" ] && { echo "${JMH_TXT_LATEST_LINK:-<li>Benchmark Results (TXT): not available</li>}"; echo "${JMH_CSV_LATEST_LINK:-<li>Benchmark Results (CSV): not available</li>}"; } \
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -90,6 +90,9 @@ local.properties
|
||||
# PMD plugin conf
|
||||
.pmd
|
||||
|
||||
# jqwik local db
|
||||
.jqwik-database
|
||||
|
||||
##---------------------------------------------------------------------------------------- Gradle
|
||||
.gradle
|
||||
**/build/
|
||||
|
||||
@@ -2,6 +2,15 @@
|
||||
|
||||
# Radixor
|
||||
|
||||
[](https://github.com/leogalambos/Radixor/actions/workflows/build.yml)
|
||||
[](https://leogalambos.github.io/Radixor/builds/latest/coverage/)
|
||||
[](https://leogalambos.github.io/Radixor/builds/latest/)
|
||||
[](https://leogalambos.github.io/Radixor/builds/latest/pitest/)
|
||||
[](https://leogalambos.github.io/Radixor/builds/latest/jmh/jmh-results.txt)
|
||||
[](https://central.sonatype.com/artifact/org.egothor/radixor)
|
||||
[](LICENSE)
|
||||
[](#)
|
||||
|
||||
*Fast algorithmic stemming with compact patch-command tries — measured at about 4× to 6× the throughput of the Snowball Porter stemmer family on the current English benchmark workload.*
|
||||
|
||||
**Radixor** is a fast, algorithmic stemming toolkit for Java, built around compact **patch-command tries** in the tradition of the original **Egothor** stemmer.
|
||||
|
||||
49
build.gradle
49
build.gradle
@@ -70,6 +70,7 @@ dependencies {
|
||||
|
||||
testImplementation libs.mockito.core
|
||||
testImplementation libs.mockito.junit.jupiter
|
||||
testImplementation libs.jqwik
|
||||
|
||||
mockitoAgent(libs.mockito.core) {
|
||||
transitive = false
|
||||
@@ -187,6 +188,54 @@ pitest {
|
||||
|
||||
application {
|
||||
mainClass = 'org.egothor.stemmer.Compile'
|
||||
applicationName = 'radixor'
|
||||
executableDir = 'bin'
|
||||
}
|
||||
|
||||
distributions {
|
||||
main {
|
||||
distributionBaseName = 'radixor'
|
||||
|
||||
contents {
|
||||
from('README.md') {
|
||||
into ''
|
||||
}
|
||||
|
||||
from('LICENSE') {
|
||||
into ''
|
||||
}
|
||||
|
||||
from('docs') {
|
||||
into 'docs'
|
||||
include 'quick-start.md'
|
||||
include 'cli-compilation.md'
|
||||
include 'dictionary-format.md'
|
||||
include 'built-in-languages.md'
|
||||
include 'programmatic-usage.md'
|
||||
include 'architecture-and-reduction.md'
|
||||
include 'quality-and-operations.md'
|
||||
include 'benchmarking.md'
|
||||
}
|
||||
|
||||
from(layout.buildDirectory.dir('generated/release-notes')) {
|
||||
into ''
|
||||
include 'CHANGELOG.md'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tasks.named('startScripts') {
|
||||
applicationName = 'radixor'
|
||||
}
|
||||
|
||||
tasks.named('distZip', Zip) {
|
||||
archiveBaseName = 'radixor'
|
||||
archiveClassifier = 'bin'
|
||||
}
|
||||
|
||||
tasks.named('distTar') {
|
||||
enabled = false
|
||||
}
|
||||
|
||||
jmh {
|
||||
|
||||
@@ -7,6 +7,11 @@ com.google.code.gson:gson:2.13.2=pmd
|
||||
com.google.errorprone:error_prone_annotations:2.41.0=pmd
|
||||
net.bytebuddy:byte-buddy-agent:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
|
||||
net.bytebuddy:byte-buddy:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
|
||||
net.jqwik:jqwik-api:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
|
||||
net.jqwik:jqwik-engine:1.9.3=jmhRuntimeClasspath,testRuntimeClasspath
|
||||
net.jqwik:jqwik-time:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
|
||||
net.jqwik:jqwik-web:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
|
||||
net.jqwik:jqwik:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
|
||||
net.sf.jopt-simple:jopt-simple:4.9=pitest
|
||||
net.sf.jopt-simple:jopt-simple:5.0.4=jmh,jmhCompileClasspath,jmhRuntimeClasspath
|
||||
net.sf.saxon:Saxon-HE:12.9=pmd
|
||||
@@ -19,7 +24,7 @@ org.apache.commons:commons-lang3:3.18.0=pitest
|
||||
org.apache.commons:commons-lang3:3.20.0=pmd
|
||||
org.apache.commons:commons-math3:3.6.1=jmh,jmhCompileClasspath,jmhRuntimeClasspath
|
||||
org.apache.commons:commons-text:1.14.0=pitest
|
||||
org.apiguardian:apiguardian-api:1.1.2=testCompileClasspath
|
||||
org.apiguardian:apiguardian-api:1.1.2=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath
|
||||
org.checkerframework:checker-qual:3.52.1=pmd
|
||||
org.jacoco:org.jacoco.agent:0.8.14=jacocoAgent,jacocoAnt
|
||||
org.jacoco:org.jacoco.ant:0.8.14=jacocoAnt
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
#
|
||||
# After changing dependency versions:
|
||||
#
|
||||
# unlock temporarily: LockMode.STRICT -> LockMode.LENIENT
|
||||
#
|
||||
# refresh verification metadata:
|
||||
# ./gradlew --write-verification-metadata sha256 test jmh distZip cyclonedxBom
|
||||
#
|
||||
# run:
|
||||
# ./gradlew --write-locks classes testClasses jmh distZip cyclonedxBom
|
||||
#
|
||||
# if needed, refresh verification metadata:
|
||||
# ./gradlew --write-verification-metadata sha256 test jmh distZip cyclonedxBom
|
||||
#
|
||||
# (optional - for Eclipse IDE)
|
||||
# insert trusted-artifacts into gradle/verification-metadata.xml/verification-metadata/configuration:
|
||||
# <trusted-artifacts>
|
||||
@@ -21,6 +23,7 @@
|
||||
[versions]
|
||||
junit = "5.14.3"
|
||||
mockito = "5.23.0"
|
||||
jqwik = "1.9.3"
|
||||
|
||||
[libraries]
|
||||
junit-bom = { module = "org.junit:junit-bom", version.ref = "junit" }
|
||||
@@ -29,3 +32,5 @@ junit-platform-launcher = { module = "org.junit.platform:junit-platform-launcher
|
||||
|
||||
mockito-core = { module = "org.mockito:mockito-core", version.ref = "mockito" }
|
||||
mockito-junit-jupiter = { module = "org.mockito:mockito-junit-jupiter", version.ref = "mockito" }
|
||||
|
||||
jqwik = { module = "net.jqwik:jqwik", version.ref = "jqwik" }
|
||||
|
||||
@@ -131,7 +131,15 @@ tasks.register('centralBundle', Zip) {
|
||||
|
||||
dependsOn(tasks.named('createCentralChecksums'))
|
||||
|
||||
from(centralStagingRepositoryDirectory)
|
||||
from(centralStagingRepositoryDirectory) {
|
||||
exclude '**/maven-metadata*.xml'
|
||||
exclude '**/maven-metadata*.xml.md5'
|
||||
exclude '**/maven-metadata*.xml.sha1'
|
||||
exclude '**/maven-metadata*.xml.asc'
|
||||
exclude '**/maven-metadata*.xml.asc.md5'
|
||||
exclude '**/maven-metadata*.xml.asc.sha1'
|
||||
}
|
||||
|
||||
destinationDirectory = centralBundleDirectory
|
||||
archiveFileName = "radixor-${project.version}-central-bundle.zip"
|
||||
}
|
||||
|
||||
@@ -568,6 +568,46 @@
|
||||
<sha256 value="1af699f8d9ddab67f9a0d202fbd7915eb0362a5a6dfd5ffc54cafa3465c9cb0a" origin="Generated by Gradle"/>
|
||||
</artifact>
|
||||
</component>
|
||||
<component group="net.jqwik" name="jqwik" version="1.9.3">
|
||||
<artifact name="jqwik-1.9.3.jar">
|
||||
<sha256 value="562931e1667308180056a8ce85791f71ab8c37ca8efc2006a163ba5d650e5f73" origin="Generated by Gradle"/>
|
||||
</artifact>
|
||||
<artifact name="jqwik-1.9.3.module">
|
||||
<sha256 value="681316f856db4ea3cac8fcced811127fc1d7016875e5b50aa4a55024513a93d7" origin="Generated by Gradle"/>
|
||||
</artifact>
|
||||
</component>
|
||||
<component group="net.jqwik" name="jqwik-api" version="1.9.3">
|
||||
<artifact name="jqwik-api-1.9.3.jar">
|
||||
<sha256 value="4bce7e80beb6d9d7092a799fa8a509d76cc31dbb20c938a9952965c15d1dd9b2" origin="Generated by Gradle"/>
|
||||
</artifact>
|
||||
<artifact name="jqwik-api-1.9.3.module">
|
||||
<sha256 value="69984416ea2e9f7fde40cfac983d2f540d3a37e9766fd3b0a06fada8f9b4cff2" origin="Generated by Gradle"/>
|
||||
</artifact>
|
||||
</component>
|
||||
<component group="net.jqwik" name="jqwik-engine" version="1.9.3">
|
||||
<artifact name="jqwik-engine-1.9.3.jar">
|
||||
<sha256 value="b85592ee78e30239ccfdca7a134f918ee94ebec51ad29a313fc9a676d97b3ede" origin="Generated by Gradle"/>
|
||||
</artifact>
|
||||
<artifact name="jqwik-engine-1.9.3.module">
|
||||
<sha256 value="2c68479ebda9e334bc9033abd2ef227353808f20114f197947b5c7b9646ab8e5" origin="Generated by Gradle"/>
|
||||
</artifact>
|
||||
</component>
|
||||
<component group="net.jqwik" name="jqwik-time" version="1.9.3">
|
||||
<artifact name="jqwik-time-1.9.3.jar">
|
||||
<sha256 value="9fd09021d8f03d44990457bf3095cf0aaf34d2785d1108ff22590286c233b3e5" origin="Generated by Gradle"/>
|
||||
</artifact>
|
||||
<artifact name="jqwik-time-1.9.3.module">
|
||||
<sha256 value="c2b056576c8767bfcd7efdd982890fbc71e608fb5c9c80fc145cfee6adeeaa24" origin="Generated by Gradle"/>
|
||||
</artifact>
|
||||
</component>
|
||||
<component group="net.jqwik" name="jqwik-web" version="1.9.3">
|
||||
<artifact name="jqwik-web-1.9.3.jar">
|
||||
<sha256 value="6aee9d583c1ff9efe319b2fa0bc9d75fc616de6d1f240ddbd2af9eabda483dbe" origin="Generated by Gradle"/>
|
||||
</artifact>
|
||||
<artifact name="jqwik-web-1.9.3.module">
|
||||
<sha256 value="38c86130c8b86c1657b4f8256e065ee08551f7c5ce728d1a5be8f63133b14554" origin="Generated by Gradle"/>
|
||||
</artifact>
|
||||
</component>
|
||||
<component group="net.sf.jopt-simple" name="jopt-simple" version="4.9">
|
||||
<artifact name="jopt-simple-4.9.jar">
|
||||
<sha256 value="26c5856e954b5f864db76f13b86919b59c6eecf9fd930b96baa8884626baf2f5" origin="Generated by Gradle"/>
|
||||
|
||||
@@ -426,6 +426,8 @@ public final class FrequencyTrie<V> {
|
||||
childNodeIds[edgeIndex] = dataInput.readInt();
|
||||
}
|
||||
|
||||
validateSerializedEdges(nodeIndex, edgeLabels);
|
||||
|
||||
final int valueCount = dataInput.readInt();
|
||||
if (valueCount < 0) {
|
||||
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
|
||||
@@ -474,6 +476,28 @@ public final class FrequencyTrie<V> {
|
||||
return nodes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates the serialized edge-label sequence for one node.
|
||||
*
|
||||
* <p>
|
||||
* Compiled nodes rely on binary search for child lookup and therefore require
|
||||
* edge labels to be stored in strict ascending order without duplicates.
|
||||
* Rejecting malformed streams here keeps lookup semantics deterministic and
|
||||
* avoids silently constructing a trie whose search behavior would be undefined.
|
||||
*
|
||||
* @param nodeIndex serialized node identifier
|
||||
* @param edgeLabels serialized edge labels
|
||||
* @throws IOException if the edge labels are not strictly ascending
|
||||
*/
|
||||
private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException {
|
||||
for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) {
|
||||
if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) {
|
||||
throw new IOException("Edge labels must be strictly ascending at node " + nodeIndex + ", edge index "
|
||||
+ edgeIndex + ": '" + edgeLabels[edgeIndex - 1] + "' then '" + edgeLabels[edgeIndex] + "'.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Locates the compiled node for the supplied key.
|
||||
*
|
||||
|
||||
@@ -117,7 +117,14 @@ public final class PatchCommandEncoder {
|
||||
private static final int MISMATCH_PENALTY = 100;
|
||||
|
||||
/**
|
||||
* Extra headroom added when internal matrices need to grow.
|
||||
* Extra matrix headroom reserved beyond the immediately required dimensions.
|
||||
*
|
||||
* <p>
|
||||
* A small fixed margin reduces repeated reallocation when a caller encodes many
|
||||
* similarly sized terms in sequence. The value is intentionally modest: large
|
||||
* enough to absorb minor size fluctuations, yet small enough to avoid
|
||||
* materially over-allocating the reused dynamic-programming matrices.
|
||||
* </p>
|
||||
*/
|
||||
private static final int CAPACITY_MARGIN = 8;
|
||||
|
||||
@@ -288,6 +295,7 @@ public final class PatchCommandEncoder {
|
||||
* @param patchCommand compact patch command
|
||||
* @return transformed word, or {@code null} when {@code source} is {@code null}
|
||||
*/
|
||||
@SuppressWarnings({ "PMD.CyclomaticComplexity", "PMD.AvoidLiteralsInIfCondition" })
|
||||
public static String apply(String source, String patchCommand) {
|
||||
if (source == null) {
|
||||
return null;
|
||||
@@ -299,6 +307,10 @@ public final class PatchCommandEncoder {
|
||||
return source;
|
||||
}
|
||||
|
||||
if ((patchCommand.length() & 1) != 0) {
|
||||
return source;
|
||||
}
|
||||
|
||||
StringBuilder result = new StringBuilder(source);
|
||||
|
||||
if (result.isEmpty()) {
|
||||
@@ -312,11 +324,14 @@ public final class PatchCommandEncoder {
|
||||
|
||||
char opcode = patchCommand.charAt(patchIndex);
|
||||
char argument = patchCommand.charAt(patchIndex + 1);
|
||||
int encodedCount = argument - 'a' + 1;
|
||||
|
||||
switch (opcode) {
|
||||
case SKIP_OPCODE:
|
||||
position = position - encodedCount + 1;
|
||||
final int skipCount = decodeEncodedCount(argument);
|
||||
if (skipCount < 1) {
|
||||
return source;
|
||||
}
|
||||
position = position - skipCount + 1;
|
||||
break;
|
||||
|
||||
case REPLACE_OPCODE:
|
||||
@@ -324,8 +339,12 @@ public final class PatchCommandEncoder {
|
||||
break;
|
||||
|
||||
case DELETE_OPCODE:
|
||||
final int deleteCount = decodeEncodedCount(argument);
|
||||
if (deleteCount < 1) {
|
||||
return source;
|
||||
}
|
||||
int deleteEndExclusive = position + 1;
|
||||
position -= encodedCount - 1;
|
||||
position -= deleteCount - 1;
|
||||
result.delete(position, deleteEndExclusive);
|
||||
break;
|
||||
|
||||
@@ -353,6 +372,26 @@ public final class PatchCommandEncoder {
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes a compact count argument used by skip and delete instructions.
|
||||
*
|
||||
* <p>
|
||||
* Valid encoded counts start at {@code 'a'} for one affected character. Values
|
||||
* below {@code 'a'} are malformed and are reported to callers via the
|
||||
* compatibility fallback path rather than by throwing a dedicated exception.
|
||||
* </p>
|
||||
*
|
||||
* @param argument serialized count argument
|
||||
* @return decoded positive count, or {@code -1} when the argument is malformed
|
||||
*/
|
||||
@SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
|
||||
private static int decodeEncodedCount(final char argument) {
|
||||
if (argument < 'a') {
|
||||
return -1;
|
||||
}
|
||||
return argument - 'a' + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies a patch command to an empty source word.
|
||||
*
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Immutable compiled trie node optimized for read access.
|
||||
@@ -38,7 +39,9 @@ import java.util.Arrays;
|
||||
* <p>
|
||||
* The returned arrays are the internal backing storage of the compiled node.
|
||||
* They are exposed for efficient access by closely related trie infrastructure
|
||||
* and therefore must never be modified by callers.
|
||||
* and therefore must never be modified by callers. The node itself is still
|
||||
* immutable from the public API perspective because construction wires these
|
||||
* arrays once and all lookup operations thereafter treat them as read-only.
|
||||
*
|
||||
* @param <V> value type
|
||||
* @param edgeLabels internal edge label array
|
||||
@@ -46,8 +49,90 @@ import java.util.Arrays;
|
||||
* @param orderedValues internal ordered values array
|
||||
* @param orderedCounts internal ordered counts array
|
||||
*/
|
||||
@SuppressWarnings("PMD.DataClass")
|
||||
public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues, int... orderedCounts) {
|
||||
|
||||
/**
|
||||
* Creates one validated compiled node.
|
||||
*
|
||||
* @throws NullPointerException if any array argument is {@code null}
|
||||
* @throws IllegalArgumentException if the edge-related arrays or value-related
|
||||
* arrays do not have matching lengths
|
||||
*/
|
||||
public CompiledNode {
|
||||
Objects.requireNonNull(edgeLabels, "edgeLabels");
|
||||
Objects.requireNonNull(children, "children");
|
||||
Objects.requireNonNull(orderedValues, "orderedValues");
|
||||
Objects.requireNonNull(orderedCounts, "orderedCounts");
|
||||
|
||||
if (edgeLabels.length != children.length) {
|
||||
throw new IllegalArgumentException("edgeLabels and children must have the same length.");
|
||||
}
|
||||
if (orderedValues.length != orderedCounts.length) {
|
||||
throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal edge-label array.
|
||||
*
|
||||
* <p>
|
||||
* The returned array is not copied for performance reasons and must be treated
|
||||
* as read-only.
|
||||
*
|
||||
* @return internal edge-label array
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public char[] edgeLabels() {
|
||||
return this.edgeLabels;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal child-node array.
|
||||
*
|
||||
* <p>
|
||||
* The returned array is not copied for performance reasons and must be treated
|
||||
* as read-only by external callers.
|
||||
*
|
||||
* @return internal child-node array
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public CompiledNode<V>[] children() {
|
||||
return this.children;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal ordered-values array.
|
||||
*
|
||||
* <p>
|
||||
* The returned array is not copied for performance reasons and must be treated
|
||||
* as read-only.
|
||||
*
|
||||
* @return internal ordered-values array
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public V[] orderedValues() {
|
||||
return this.orderedValues;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal ordered-counts array.
|
||||
*
|
||||
* <p>
|
||||
* The returned array is not copied for performance reasons and must be treated
|
||||
* as read-only.
|
||||
*
|
||||
* @return internal ordered-counts array
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public int[] orderedCounts() {
|
||||
return this.orderedCounts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds a child for the supplied edge character.
|
||||
*
|
||||
|
||||
@@ -30,14 +30,18 @@
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Intermediate node data used during deserialization before child references
|
||||
* are resolved.
|
||||
*
|
||||
* <p>
|
||||
* The arrays exposed by the accessors are the internal backing storage of this
|
||||
* holder. They are returned directly for efficiency and therefore must be
|
||||
* treated as read-only by callers.
|
||||
* holder. They are returned directly for efficiency because the deserialization
|
||||
* pipeline copies references into immutable compiled nodes immediately after
|
||||
* the record is created. Callers must therefore treat every returned array as
|
||||
* read-only.
|
||||
*
|
||||
* @param <V> value type
|
||||
* @param edgeLabels edge labels
|
||||
@@ -45,6 +49,87 @@ package org.egothor.stemmer.trie;
|
||||
* @param orderedValues ordered values
|
||||
* @param orderedCounts ordered counts
|
||||
*/
|
||||
@SuppressWarnings("PMD.DataClass")
|
||||
public record NodeData<V>(char[] edgeLabels, int[] childNodeIds, V[] orderedValues, int... orderedCounts) {
|
||||
/**
|
||||
* Creates one validated node-data holder.
|
||||
*
|
||||
* @throws NullPointerException if any array argument is {@code null}
|
||||
* @throws IllegalArgumentException if the edge-related arrays or value-related
|
||||
* arrays do not have matching lengths
|
||||
*/
|
||||
public NodeData {
|
||||
Objects.requireNonNull(edgeLabels, "edgeLabels");
|
||||
Objects.requireNonNull(childNodeIds, "childNodeIds");
|
||||
Objects.requireNonNull(orderedValues, "orderedValues");
|
||||
Objects.requireNonNull(orderedCounts, "orderedCounts");
|
||||
|
||||
if (edgeLabels.length != childNodeIds.length) {
|
||||
throw new IllegalArgumentException("edgeLabels and childNodeIds must have the same length.");
|
||||
}
|
||||
if (orderedValues.length != orderedCounts.length) {
|
||||
throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal edge-label array.
|
||||
*
|
||||
* <p>
|
||||
* The returned array is not copied for performance reasons and must be treated
|
||||
* as read-only.
|
||||
*
|
||||
* @return internal edge-label array
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public char[] edgeLabels() {
|
||||
return this.edgeLabels;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal child-node identifier array.
|
||||
*
|
||||
* <p>
|
||||
* The returned array is not copied for performance reasons and must be treated
|
||||
* as read-only.
|
||||
*
|
||||
* @return internal child-node identifier array
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public int[] childNodeIds() {
|
||||
return this.childNodeIds;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal ordered-values array.
|
||||
*
|
||||
* <p>
|
||||
* The returned array is not copied for performance reasons and must be treated
|
||||
* as read-only.
|
||||
*
|
||||
* @return internal ordered-values array
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public V[] orderedValues() {
|
||||
return this.orderedValues;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the internal ordered-counts array.
|
||||
*
|
||||
* <p>
|
||||
* The returned array is not copied for performance reasons and must be treated
|
||||
* as read-only.
|
||||
*
|
||||
* @return internal ordered-counts array
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("PMD.MethodReturnsInternalArray")
|
||||
public int[] orderedCounts() {
|
||||
return this.orderedCounts;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
218
src/test/java/org/egothor/stemmer/FrequencyTrieProperties.java
Normal file
218
src/test/java/org/egothor/stemmer/FrequencyTrieProperties.java
Normal file
@@ -0,0 +1,218 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.List;
|
||||
|
||||
import net.jqwik.api.ForAll;
|
||||
import net.jqwik.api.Label;
|
||||
import net.jqwik.api.Property;
|
||||
import net.jqwik.api.Tag;
|
||||
|
||||
/**
|
||||
* Property-based tests for the compiled trie abstraction.
|
||||
*
|
||||
* <p>
|
||||
* These properties focus on deterministic compilation, observable lookup
|
||||
* alignment, binary persistence stability, and safe reconstruction back into a
|
||||
* writable builder. Together they guard the most valuable invariants of the
|
||||
* core algorithm without overfitting to particular fixture data.
|
||||
*/
|
||||
@Label("FrequencyTrie properties")
|
||||
@Tag("unit")
|
||||
@Tag("property")
|
||||
@Tag("trie")
|
||||
class FrequencyTrieProperties extends PropertyBasedTestSupport {
|
||||
|
||||
/**
|
||||
* Binary codec used by generic trie round-trip assertions.
|
||||
*/
|
||||
private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new FrequencyTrie.ValueStreamCodec<>() {
|
||||
|
||||
@Override
|
||||
public void write(final DataOutputStream dataOutput, final String value) throws IOException {
|
||||
dataOutput.writeUTF(value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String read(final DataInputStream dataInput) throws IOException {
|
||||
return dataInput.readUTF();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Verifies that compiling the same insertion scenario repeatedly yields the
|
||||
* same observable lookups.
|
||||
*
|
||||
* @param scenario generated trie scenario
|
||||
* @param reductionMode reduction mode
|
||||
*/
|
||||
@Property(tries = 80)
|
||||
@Label("compilation should be deterministic for the same insertion scenario")
|
||||
void compilationShouldBeDeterministicForTheSameInsertionScenario(
|
||||
@ForAll("trieScenarios") final TrieScenario scenario, @ForAll final ReductionMode reductionMode) {
|
||||
final FrequencyTrie<String> first = buildTrie(scenario, reductionMode);
|
||||
final FrequencyTrie<String> second = buildTrie(scenario, reductionMode);
|
||||
|
||||
for (String key : scenario.observedKeys()) {
|
||||
assertTrieStateEquals(first, second, key);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that {@link FrequencyTrie#get(String)},
|
||||
* {@link FrequencyTrie#getAll(String)}, and
|
||||
* {@link FrequencyTrie#getEntries(String)} remain aligned for every probed key.
|
||||
*
|
||||
* @param scenario generated trie scenario
|
||||
* @param reductionMode reduction mode
|
||||
*/
|
||||
@Property(tries = 80)
|
||||
@Label("get, getAll, and getEntries should stay semantically aligned")
|
||||
void getGetAllAndGetEntriesShouldStaySemanticallyAligned(@ForAll("trieScenarios") final TrieScenario scenario,
|
||||
@ForAll final ReductionMode reductionMode) {
|
||||
final FrequencyTrie<String> trie = buildTrie(scenario, reductionMode);
|
||||
|
||||
for (String key : scenario.observedKeys()) {
|
||||
final String preferred = trie.get(key);
|
||||
final String[] allValues = trie.getAll(key);
|
||||
final List<ValueCount<String>> entries = trie.getEntries(key);
|
||||
|
||||
assertEquals(allValues.length, entries.size(), "getAll() and getEntries() must have equal cardinality.");
|
||||
|
||||
if (allValues.length == 0) {
|
||||
assertNull(preferred, "get() must return null when no terminal value exists.");
|
||||
assertTrue(entries.isEmpty(), "getEntries() must be empty when getAll() is empty.");
|
||||
continue;
|
||||
}
|
||||
|
||||
assertEquals(allValues[0], preferred, "get() must expose the preferred first getAll() value.");
|
||||
|
||||
int previousCount = Integer.MAX_VALUE;
|
||||
for (int index = 0; index < entries.size(); index++) {
|
||||
final ValueCount<String> entry = entries.get(index);
|
||||
assertEquals(allValues[index], entry.value(), "entry ordering must match getAll() ordering.");
|
||||
assertTrue(entry.count() >= 1, "stored frequencies must remain positive.");
|
||||
assertTrue(entry.count() <= previousCount, "entry counts must be ordered descending.");
|
||||
previousCount = entry.count();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that binary serialization and deserialization preserve all
|
||||
* observable lookup semantics for generated scenarios.
|
||||
*
|
||||
* @param scenario generated trie scenario
|
||||
* @param reductionMode reduction mode
|
||||
*/
|
||||
@Property(tries = 40)
|
||||
@Label("binary round-trip should preserve observable trie semantics")
|
||||
void binaryRoundTripShouldPreserveObservableTrieSemantics(@ForAll("trieScenarios") final TrieScenario scenario,
|
||||
@ForAll final ReductionMode reductionMode) {
|
||||
final FrequencyTrie<String> original = buildTrie(scenario, reductionMode);
|
||||
final FrequencyTrie<String> roundTripped = roundTrip(original);
|
||||
|
||||
for (String key : scenario.observedKeys()) {
|
||||
assertTrieStateEquals(original, roundTripped, key);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that reconstructing a writable builder from a compiled trie and
|
||||
* recompiling it preserves observable lookup semantics.
|
||||
*
|
||||
* @param scenario generated trie scenario
|
||||
* @param reductionMode reduction mode
|
||||
*/
|
||||
@Property(tries = 60)
|
||||
@Label("builder reconstruction should preserve observable trie semantics")
|
||||
void builderReconstructionShouldPreserveObservableTrieSemantics(
|
||||
@ForAll("trieScenarios") final TrieScenario scenario, @ForAll final ReductionMode reductionMode) {
|
||||
final FrequencyTrie<String> original = buildTrie(scenario, reductionMode);
|
||||
final FrequencyTrie<String> rebuilt = FrequencyTrieBuilders
|
||||
.copyOf(original, STRING_ARRAY_FACTORY, reductionMode).build();
|
||||
|
||||
for (String key : scenario.observedKeys()) {
|
||||
assertEquals(original.get(key), rebuilt.get(key), "preferred lookup must survive reconstruction.");
|
||||
assertArrayEquals(original.getAll(key), rebuilt.getAll(key),
|
||||
"complete ordered result set must survive reconstruction.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Asserts full observable trie equality for one key.
|
||||
*
|
||||
* @param expected expected trie
|
||||
* @param actual actual trie
|
||||
* @param key key to probe
|
||||
*/
|
||||
private static void assertTrieStateEquals(final FrequencyTrie<String> expected, final FrequencyTrie<String> actual,
|
||||
final String key) {
|
||||
assertEquals(expected.get(key), actual.get(key), "preferred lookup drifted.");
|
||||
assertArrayEquals(expected.getAll(key), actual.getAll(key), "ordered result set drifted.");
|
||||
assertIterableEquals(expected.getEntries(key), actual.getEntries(key), "entry list drifted.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Round-trips one trie through its binary representation.
|
||||
*
|
||||
* @param trie trie to persist and reload
|
||||
* @return reloaded trie
|
||||
*/
|
||||
private static FrequencyTrie<String> roundTrip(final FrequencyTrie<String> trie) {
|
||||
try {
|
||||
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
try (DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream)) {
|
||||
trie.writeTo(dataOutputStream, STRING_CODEC);
|
||||
}
|
||||
|
||||
try (DataInputStream dataInputStream = new DataInputStream(
|
||||
new ByteArrayInputStream(byteArrayOutputStream.toByteArray()))) {
|
||||
return FrequencyTrie.readFrom(dataInputStream, STRING_ARRAY_FACTORY, STRING_CODEC);
|
||||
}
|
||||
} catch (IOException exception) {
|
||||
throw new UncheckedIOException("Unexpected binary round-trip failure.", exception);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -733,6 +733,30 @@ class FrequencyTrieTest {
|
||||
assertTrue(exception.getMessage().contains("Invalid root node id"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that deserialization rejects unsorted or duplicate serialized edge
|
||||
* labels because compiled lookup relies on binary search over a strictly
|
||||
* ascending edge array.
|
||||
*/
|
||||
@Test
|
||||
@Tag("persistence")
|
||||
@DisplayName("readFrom rejects non-ascending serialized edge labels")
|
||||
void readFromRejectsNonAscendingSerializedEdgeLabels() {
|
||||
final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
|
||||
dataOutput.writeInt(2);
|
||||
dataOutput.writeChar('b');
|
||||
dataOutput.writeInt(0);
|
||||
dataOutput.writeChar('a');
|
||||
dataOutput.writeInt(0);
|
||||
dataOutput.writeInt(0);
|
||||
} });
|
||||
|
||||
final IOException exception = assertThrows(IOException.class,
|
||||
() -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
|
||||
|
||||
assertTrue(exception.getMessage().contains("Edge labels must be strictly ascending"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that deserialization rejects non-positive stored counts.
|
||||
*/
|
||||
|
||||
@@ -0,0 +1,308 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertAll;
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Set;
|
||||
import java.util.function.IntFunction;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
|
||||
/**
|
||||
* Deterministic fuzz-style tests for trie compilation and generated stemming
|
||||
* dictionaries.
|
||||
*
|
||||
* <p>
|
||||
* These tests exercise bounded pseudo-random inputs with fixed seeds. The suite
|
||||
* focuses on invariants that are meaningful for CI: compilation must remain
|
||||
* stable, lookups must remain deterministic, binary round-trips must preserve
|
||||
* observable behavior, and generated patch commands must reconstruct one of the
|
||||
* stems declared by the source dictionary.
|
||||
*/
|
||||
@DisplayName("Deterministic fuzz-style trie and stemmer compilation")
|
||||
@Tag("unit")
|
||||
@Tag("fuzz")
|
||||
@Tag("trie")
|
||||
@Tag("stemming")
|
||||
class FuzzStemmerAndTrieCompilationTest {
|
||||
|
||||
/**
|
||||
* Shared array factory used by generated tries.
|
||||
*/
|
||||
private static final IntFunction<String[]> ARRAY_FACTORY = String[]::new;
|
||||
|
||||
/**
|
||||
* Binary codec used for generic trie round-trip assertions.
|
||||
*/
|
||||
private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new FrequencyTrie.ValueStreamCodec<String>() {
|
||||
|
||||
@Override
|
||||
public void write(final DataOutputStream dataOutput, final String value) throws IOException {
|
||||
dataOutput.writeUTF(value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String read(final DataInputStream dataInput) throws IOException {
|
||||
return dataInput.readUTF();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Temporary directory for generated dictionaries and binary artifacts.
|
||||
*/
|
||||
@TempDir
|
||||
Path temporaryDirectory;
|
||||
|
||||
/**
|
||||
* Verifies that bounded pseudo-random trie insertions compile deterministically
|
||||
* and preserve observable semantics across rebuild, binary serialization, and
|
||||
* builder reconstruction.
|
||||
*
|
||||
* @throws IOException if an unexpected binary I/O failure occurs
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("generated trie insertions should preserve semantics across compilation forms")
|
||||
void generatedTrieInsertionsShouldPreserveSemanticsAcrossCompilationForms() throws IOException {
|
||||
for (ReductionMode reductionMode : ReductionMode.values()) {
|
||||
final ReductionSettings reductionSettings = ReductionSettings.withDefaults(reductionMode);
|
||||
for (FuzzTestSupport.TrieCompilationScenario scenario : FuzzTestSupport.trieCompilationScenarios()
|
||||
.toList()) {
|
||||
final FrequencyTrie<String> compiled = buildTrie(scenario, reductionSettings);
|
||||
final FrequencyTrie<String> rebuilt = buildTrie(scenario, reductionSettings);
|
||||
final FrequencyTrie<String> roundTripped = roundTrip(compiled);
|
||||
final FrequencyTrie<String> reconstructed = FrequencyTrieBuilders.copyOf(compiled, ARRAY_FACTORY,
|
||||
reductionSettings).build();
|
||||
|
||||
for (String key : scenario.observedKeys()) {
|
||||
assertTrieStateEquals(compiled, rebuilt, key,
|
||||
describeScenario("repeated compilation drifted", reductionMode, scenario, key));
|
||||
assertTrieStateEquals(compiled, roundTripped, key,
|
||||
describeScenario("binary round-trip drifted", reductionMode, scenario, key));
|
||||
assertTrieLookupSemanticsEqual(compiled, reconstructed, key,
|
||||
describeScenario("builder reconstruction drifted", reductionMode, scenario, key));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that generated dictionaries compile without failure and that the
|
||||
* preferred patch command for each generated word reconstructs one acceptable
|
||||
* source stem.
|
||||
*
|
||||
* @throws IOException if the generated dictionary cannot be written or read
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("generated dictionaries should compile and stem consistently")
|
||||
void generatedDictionariesShouldCompileAndStemConsistently() throws IOException {
|
||||
for (ReductionMode reductionMode : ReductionMode.values()) {
|
||||
for (FuzzTestSupport.StemmerDictionaryScenario scenario : FuzzTestSupport.stemmerDictionaryScenarios()
|
||||
.toList()) {
|
||||
final Path dictionaryFile = this.temporaryDirectory
|
||||
.resolve("fuzz-dictionary-" + reductionMode.name() + "-" + scenario.seed() + ".txt");
|
||||
Files.writeString(dictionaryFile, scenario.dictionaryContent(), StandardCharsets.UTF_8);
|
||||
|
||||
final FrequencyTrie<String> trie = assertDoesNotThrow(
|
||||
() -> StemmerPatchTrieLoader.load(dictionaryFile, true, reductionMode),
|
||||
describeScenario("generated dictionary must compile", reductionMode, scenario, null));
|
||||
|
||||
for (String word : scenario.expectedStemsByWord().keySet()) {
|
||||
final Set<String> acceptableStems = scenario.expectedStemsByWord().get(word);
|
||||
final String preferredPatch = trie.get(word);
|
||||
final String[] allPatches = trie.getAll(word);
|
||||
|
||||
assertAll(
|
||||
() -> assertTrue(preferredPatch != null && !preferredPatch.isEmpty(),
|
||||
describeScenario("preferred patch must exist", reductionMode, scenario, word)),
|
||||
() -> assertTrue(allPatches.length >= 1,
|
||||
describeScenario("at least one patch must exist", reductionMode, scenario, word)),
|
||||
() -> assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(word, preferredPatch)),
|
||||
describeScenario("preferred patch reconstructed an unexpected stem",
|
||||
reductionMode, scenario, word)),
|
||||
() -> assertTrue(allPatchesProduceOnlyAcceptableStems(word, allPatches, acceptableStems),
|
||||
describeScenario("getAll() contained a patch outside the accepted stem set",
|
||||
reductionMode, scenario, word)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that binary persistence of generated stemmer tries preserves all
|
||||
* observable lookups for the generated vocabulary.
|
||||
*
|
||||
* @throws IOException if persistence unexpectedly fails
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("generated stemmer tries should survive binary persistence")
|
||||
void generatedStemmerTriesShouldSurviveBinaryPersistence() throws IOException {
|
||||
for (FuzzTestSupport.StemmerDictionaryScenario scenario : FuzzTestSupport.stemmerDictionaryScenarios()
|
||||
.toList()) {
|
||||
final Path dictionaryFile = this.temporaryDirectory.resolve("binary-fuzz-" + scenario.seed() + ".txt");
|
||||
final Path binaryFile = this.temporaryDirectory.resolve("binary-fuzz-" + scenario.seed() + ".dat.gz");
|
||||
|
||||
Files.writeString(dictionaryFile, scenario.dictionaryContent(), StandardCharsets.UTF_8);
|
||||
|
||||
final FrequencyTrie<String> original = StemmerPatchTrieLoader.load(dictionaryFile, true,
|
||||
ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
|
||||
StemmerPatchTrieLoader.saveBinary(original, binaryFile);
|
||||
final FrequencyTrie<String> reloaded = StemmerPatchTrieLoader.loadBinary(binaryFile);
|
||||
|
||||
for (String word : scenario.expectedStemsByWord().keySet()) {
|
||||
assertTrieStateEquals(original, reloaded, word,
|
||||
"Binary stemmer round-trip drifted for seed=" + scenario.seed() + ", word='" + word + "'.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds one trie from the supplied generated scenario.
|
||||
*
|
||||
* @param scenario generated scenario
|
||||
* @param reductionSettings reduction settings
|
||||
* @return compiled trie
|
||||
*/
|
||||
private static FrequencyTrie<String> buildTrie(final FuzzTestSupport.TrieCompilationScenario scenario,
|
||||
final ReductionSettings reductionSettings) {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(ARRAY_FACTORY, reductionSettings);
|
||||
for (FuzzTestSupport.TrieInsertion insertion : scenario.insertions()) {
|
||||
builder.put(insertion.key(), insertion.value(), insertion.count());
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs a generic binary round-trip of a compiled trie.
|
||||
*
|
||||
* @param trie source trie
|
||||
* @return deserialized trie
|
||||
* @throws IOException if persistence fails
|
||||
*/
|
||||
private static FrequencyTrie<String> roundTrip(final FrequencyTrie<String> trie) throws IOException {
|
||||
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
trie.writeTo(outputStream, STRING_CODEC);
|
||||
return FrequencyTrie.readFrom(new ByteArrayInputStream(outputStream.toByteArray()), ARRAY_FACTORY, STRING_CODEC);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares all observable lookup views for one key.
|
||||
*
|
||||
* @param expected reference trie
|
||||
* @param actual candidate trie
|
||||
* @param key key to inspect
|
||||
* @param failureMessage assertion message
|
||||
*/
|
||||
private static void assertTrieStateEquals(final FrequencyTrie<String> expected, final FrequencyTrie<String> actual,
|
||||
final String key, final String failureMessage) {
|
||||
assertAll(
|
||||
() -> assertEquals(expected.get(key), actual.get(key), failureMessage),
|
||||
() -> assertArrayEquals(expected.getAll(key), actual.getAll(key), failureMessage),
|
||||
() -> assertIterableEquals(expected.getEntries(key), actual.getEntries(key), failureMessage));
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares only lookup semantics that are expected to survive reconstruction
|
||||
* from a reduced compiled trie.
|
||||
*
|
||||
* <p>
|
||||
* Some reduction modes intentionally ignore absolute local frequencies when
|
||||
* identifying equivalent subtrees. Reconstructing a mutable builder from the
|
||||
* reduced compiled form and compiling it again must therefore preserve
|
||||
* observable lookup semantics, but it does not necessarily preserve original
|
||||
* local counts reported by {@link FrequencyTrie#getEntries(String)}.
|
||||
*
|
||||
* @param expected reference trie
|
||||
* @param actual candidate trie
|
||||
* @param key key to inspect
|
||||
* @param failureMessage assertion message
|
||||
*/
|
||||
private static void assertTrieLookupSemanticsEqual(final FrequencyTrie<String> expected,
|
||||
final FrequencyTrie<String> actual, final String key, final String failureMessage) {
|
||||
assertAll(
|
||||
() -> assertEquals(expected.get(key), actual.get(key), failureMessage),
|
||||
() -> assertArrayEquals(expected.getAll(key), actual.getAll(key), failureMessage));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that every patch in the array reconstructs one acceptable stem.
|
||||
*
|
||||
* @param word original surface form
|
||||
* @param patches patch commands
|
||||
* @param acceptableStems acceptable stems
|
||||
* @return {@code true} when all patches are acceptable
|
||||
*/
|
||||
private static boolean allPatchesProduceOnlyAcceptableStems(final String word, final String[] patches,
|
||||
final Set<String> acceptableStems) {
|
||||
for (String patch : patches) {
|
||||
if (!acceptableStems.contains(PatchCommandEncoder.apply(word, patch))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a contextual assertion message.
|
||||
*
|
||||
* @param prefix failure prefix
|
||||
* @param reductionMode reduction mode under test
|
||||
* @param scenario source scenario
|
||||
* @param word current word or key, may be {@code null}
|
||||
* @return contextual message
|
||||
*/
|
||||
private static String describeScenario(final String prefix, final ReductionMode reductionMode, final Object scenario,
|
||||
final String word) {
|
||||
final StringBuilder builder = new StringBuilder(128);
|
||||
builder.append(prefix).append(". reductionMode=").append(reductionMode).append(", scenario=")
|
||||
.append(scenario);
|
||||
if (word != null) {
|
||||
builder.append(", token='").append(word).append('\'');
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
}
|
||||
339
src/test/java/org/egothor/stemmer/FuzzTestSupport.java
Normal file
339
src/test/java/org/egothor/stemmer/FuzzTestSupport.java
Normal file
@@ -0,0 +1,339 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
|
||||
* Deterministic support utilities for fuzz-style tests of trie compilation and
|
||||
* stemming dictionary loading.
|
||||
*
|
||||
* <p>
|
||||
* The generators in this helper intentionally use bounded input sizes and fixed
|
||||
* seeds so that the resulting tests remain reproducible and suitable for CI.
|
||||
* The goal is not statistical randomness, but broad structured coverage of
|
||||
* unusual combinations that are cumbersome to author manually.
|
||||
*/
|
||||
final class FuzzTestSupport {
|
||||
|
||||
/**
|
||||
* Shared deterministic seeds used across all generated scenarios.
|
||||
*/
|
||||
private static final long[] SEEDS = { 7L, 19L, 43L, 71L, 101L, 211L };
|
||||
|
||||
/**
|
||||
* Lower-case alphabet used for generated word material.
|
||||
*/
|
||||
private static final char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz".toCharArray();
|
||||
|
||||
/**
|
||||
* Utility class.
|
||||
*/
|
||||
private FuzzTestSupport() {
|
||||
throw new AssertionError("No instances.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns deterministic trie-compilation scenarios.
|
||||
*
|
||||
* @return stream of bounded deterministic scenarios
|
||||
*/
|
||||
static Stream<TrieCompilationScenario> trieCompilationScenarios() {
|
||||
final List<TrieCompilationScenario> scenarios = new ArrayList<>(SEEDS.length);
|
||||
for (long seed : SEEDS) {
|
||||
scenarios.add(createTrieCompilationScenario(seed));
|
||||
}
|
||||
return scenarios.stream();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns deterministic stemmer-dictionary scenarios.
|
||||
*
|
||||
* @return stream of bounded deterministic scenarios
|
||||
*/
|
||||
static Stream<StemmerDictionaryScenario> stemmerDictionaryScenarios() {
|
||||
final List<StemmerDictionaryScenario> scenarios = new ArrayList<>(SEEDS.length);
|
||||
for (long seed : SEEDS) {
|
||||
scenarios.add(createStemmerDictionaryScenario(seed));
|
||||
}
|
||||
return scenarios.stream();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates one trie scenario with repeated insertions, empty-key coverage, and a
|
||||
* stable set of observed keys.
|
||||
*
|
||||
* @param seed deterministic seed
|
||||
* @return generated scenario
|
||||
*/
|
||||
private static TrieCompilationScenario createTrieCompilationScenario(final long seed) {
|
||||
final Random random = new Random(seed);
|
||||
final List<TrieInsertion> insertions = new ArrayList<>();
|
||||
final Set<String> observedKeys = new LinkedHashSet<>();
|
||||
|
||||
observedKeys.add("");
|
||||
|
||||
final int insertionCount = 50 + random.nextInt(15);
|
||||
for (int index = 0; index < insertionCount; index++) {
|
||||
final String key = random.nextInt(8) == 0 ? "" : nextWord(random, 1, 10);
|
||||
final String value = nextWord(random, 0, 8);
|
||||
final int count = 1 + random.nextInt(4);
|
||||
|
||||
insertions.add(new TrieInsertion(key, value, count));
|
||||
observedKeys.add(key);
|
||||
|
||||
if (!key.isEmpty() && random.nextBoolean()) {
|
||||
observedKeys.add(key.substring(0, Math.max(0, key.length() - 1)));
|
||||
}
|
||||
observedKeys.add(nextWord(random, 1, 8));
|
||||
}
|
||||
|
||||
return new TrieCompilationScenario(seed, List.copyOf(insertions), List.copyOf(observedKeys));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates one dictionary scenario made of compact stem-to-variants groups.
|
||||
*
|
||||
* @param seed deterministic seed
|
||||
* @return generated scenario
|
||||
*/
|
||||
private static StemmerDictionaryScenario createStemmerDictionaryScenario(final long seed) {
|
||||
final Random random = new Random(seed);
|
||||
final Map<String, Set<String>> expectedStemsByWord = new LinkedHashMap<>();
|
||||
final StringBuilder dictionary = new StringBuilder(512);
|
||||
|
||||
dictionary.append("# deterministic fuzz dictionary seed ").append(seed).append('\n');
|
||||
dictionary.append("// blank and remark handling is part of the exercised input\n\n");
|
||||
|
||||
final int entryCount = 18 + random.nextInt(8);
|
||||
for (int index = 0; index < entryCount; index++) {
|
||||
final String stem = nextWord(random, 1, 8);
|
||||
final LinkedHashSet<String> variants = new LinkedHashSet<>();
|
||||
final int variantCount = 1 + random.nextInt(4);
|
||||
|
||||
while (variants.size() < variantCount) {
|
||||
if (random.nextInt(6) == 0) {
|
||||
variants.add(stem);
|
||||
} else {
|
||||
variants.add(createVariant(random, stem));
|
||||
}
|
||||
}
|
||||
|
||||
dictionary.append(stem);
|
||||
for (String variant : variants) {
|
||||
dictionary.append(' ').append(variant);
|
||||
expectedStemsByWord.computeIfAbsent(variant, ignored -> new LinkedHashSet<>()).add(stem);
|
||||
}
|
||||
dictionary.append(" # entry ").append(index).append('\n');
|
||||
|
||||
if (random.nextInt(5) == 0) {
|
||||
dictionary.append("\n");
|
||||
}
|
||||
}
|
||||
|
||||
return new StemmerDictionaryScenario(seed, dictionary.toString(), immutableMapOfSets(expectedStemsByWord));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a variant related to a supplied stem.
|
||||
*
|
||||
* @param random source of deterministic pseudo-randomness
|
||||
* @param stem canonical stem
|
||||
* @return generated variant
|
||||
*/
|
||||
private static String createVariant(final Random random, final String stem) {
|
||||
final int mode = random.nextInt(6);
|
||||
switch (mode) {
|
||||
case 0:
|
||||
return stem + suffix(random);
|
||||
case 1:
|
||||
return prefix(random) + stem;
|
||||
case 2:
|
||||
return stem.length() > 1 ? stem.substring(0, stem.length() - 1) + nextLetter(random) : stem + nextLetter(random);
|
||||
case 3:
|
||||
return stem + nextLetter(random) + nextLetter(random);
|
||||
case 4:
|
||||
return stem.length() > 2 ? stem.substring(0, stem.length() - 2) : stem;
|
||||
default:
|
||||
return new StringBuilder(stem).reverse().append(nextLetter(random)).toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a generated word in lower case.
|
||||
*
|
||||
* @param random source of deterministic pseudo-randomness
|
||||
* @param minLength minimum inclusive length
|
||||
* @param maxLength maximum inclusive length
|
||||
* @return generated word
|
||||
*/
|
||||
private static String nextWord(final Random random, final int minLength, final int maxLength) {
|
||||
final int length = minLength + random.nextInt(maxLength - minLength + 1);
|
||||
final StringBuilder builder = new StringBuilder(length);
|
||||
for (int index = 0; index < length; index++) {
|
||||
builder.append(nextLetter(random));
|
||||
}
|
||||
return builder.toString().toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns one generated prefix fragment.
|
||||
*
|
||||
* @param random source of deterministic pseudo-randomness
|
||||
* @return prefix fragment
|
||||
*/
|
||||
private static String prefix(final Random random) {
|
||||
return String.valueOf(nextLetter(random));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns one generated suffix fragment.
|
||||
*
|
||||
* @param random source of deterministic pseudo-randomness
|
||||
* @return suffix fragment
|
||||
*/
|
||||
private static String suffix(final Random random) {
|
||||
final String[] suffixes = { "s", "ed", "ing", "er", "ly", "ness", "ment" };
|
||||
return suffixes[random.nextInt(suffixes.length)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns one generated lower-case letter.
|
||||
*
|
||||
* @param random source of deterministic pseudo-randomness
|
||||
* @return generated character
|
||||
*/
|
||||
private static char nextLetter(final Random random) {
|
||||
return ALPHABET[random.nextInt(ALPHABET.length)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an immutable map view whose nested sets are also immutable.
|
||||
*
|
||||
* @param source mutable source map
|
||||
* @return immutable copy
|
||||
*/
|
||||
private static Map<String, Set<String>> immutableMapOfSets(final Map<String, Set<String>> source) {
|
||||
final Map<String, Set<String>> copy = new LinkedHashMap<>(source.size());
|
||||
for (Map.Entry<String, Set<String>> entry : source.entrySet()) {
|
||||
copy.put(entry.getKey(), Set.copyOf(entry.getValue()));
|
||||
}
|
||||
return Map.copyOf(copy);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generated trie scenario for deterministic fuzz testing.
|
||||
*
|
||||
* @param seed deterministic seed
|
||||
* @param insertions generated insertions to apply to the builder
|
||||
* @param observedKeys keys that should be checked after compilation
|
||||
*/
|
||||
record TrieCompilationScenario(long seed, List<TrieInsertion> insertions, List<String> observedKeys) {
|
||||
|
||||
/**
|
||||
* Creates a validated scenario.
|
||||
*
|
||||
* @param seed deterministic seed
|
||||
* @param insertions generated insertions to apply to the builder
|
||||
* @param observedKeys keys that should be checked after compilation
|
||||
*/
|
||||
TrieCompilationScenario {
|
||||
Objects.requireNonNull(insertions, "insertions");
|
||||
Objects.requireNonNull(observedKeys, "observedKeys");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "seed=" + this.seed;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * One generated insertion into a trie builder.
 *
 * @param key target key
 * @param value stored value
 * @param count positive occurrence count
 */
record TrieInsertion(String key, String value, int count) {

    /**
     * Creates a validated insertion.
     *
     * @param key target key
     * @param value stored value
     * @param count positive occurrence count
     */
    TrieInsertion {
        key = Objects.requireNonNull(key, "key");
        value = Objects.requireNonNull(value, "value");
        if (count <= 0) {
            throw new IllegalArgumentException("count must be positive.");
        }
    }
}
|
||||
|
||||
/**
 * Generated dictionary scenario for deterministic fuzz testing of stemming.
 *
 * @param seed deterministic seed
 * @param dictionaryContent generated dictionary content
 * @param expectedStemsByWord acceptable stems for each generated word
 */
record StemmerDictionaryScenario(long seed, String dictionaryContent, Map<String, Set<String>> expectedStemsByWord) {

    /**
     * Creates a validated scenario.
     *
     * @param seed deterministic seed
     * @param dictionaryContent generated dictionary content
     * @param expectedStemsByWord acceptable stems for each generated word
     */
    StemmerDictionaryScenario {
        dictionaryContent = Objects.requireNonNull(dictionaryContent, "dictionaryContent");
        expectedStemsByWord = Objects.requireNonNull(expectedStemsByWord, "expectedStemsByWord");
    }

    /** Reports only the seed so a failing run stays easy to replay. */
    @Override
    public String toString() {
        return "seed=" + seed;
    }
}
|
||||
}
|
||||
@@ -0,0 +1,93 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import net.jqwik.api.ForAll;
|
||||
import net.jqwik.api.Label;
|
||||
import net.jqwik.api.Property;
|
||||
import net.jqwik.api.Tag;
|
||||
|
||||
/**
|
||||
* Property-based tests for {@link PatchCommandEncoder}.
|
||||
*
|
||||
* <p>
|
||||
* These properties protect the most important behavioral contract of the patch
|
||||
* language: encoding must be deterministic and applying an encoded patch must
|
||||
* reconstruct the exact requested target.
|
||||
*/
|
||||
@Label("PatchCommandEncoder properties")
|
||||
@Tag("unit")
|
||||
@Tag("property")
|
||||
@Tag("patch")
|
||||
class PatchCommandEncoderProperties extends PropertyBasedTestSupport {
|
||||
|
||||
/**
|
||||
* Verifies that encoding followed by application reconstructs the original
|
||||
* target word for bounded generated inputs.
|
||||
*
|
||||
* @param source source word
|
||||
* @param target target word
|
||||
*/
|
||||
@Property(tries = 200)
|
||||
@Label("encode followed by apply should reconstruct the target word")
|
||||
void encodeFollowedByApplyShouldReconstructTheTargetWord(@ForAll("words") final String source,
|
||||
@ForAll("words") final String target) {
|
||||
final PatchCommandEncoder encoder = new PatchCommandEncoder();
|
||||
final String patch = encoder.encode(source, target);
|
||||
|
||||
assertNotNull(patch, "patch generation must succeed for non-null inputs.");
|
||||
assertEquals(target, PatchCommandEncoder.apply(source, patch),
|
||||
"applying the encoded patch must reconstruct the target word.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that encoding is deterministic for the same source-target pair, both
|
||||
* within one encoder instance and across fresh instances.
|
||||
*
|
||||
* @param source source word
|
||||
* @param target target word
|
||||
*/
|
||||
@Property(tries = 150)
|
||||
@Label("encode should be deterministic for one source-target pair")
|
||||
void encodeShouldBeDeterministicForOneSourceTargetPair(@ForAll("words") final String source,
|
||||
@ForAll("words") final String target) {
|
||||
final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder();
|
||||
final String first = sharedEncoder.encode(source, target);
|
||||
final String second = sharedEncoder.encode(source, target);
|
||||
final String fresh = new PatchCommandEncoder().encode(source, target);
|
||||
|
||||
assertEquals(first, second, "one encoder instance must produce stable output.");
|
||||
assertEquals(first, fresh, "fresh encoder instances must produce the same patch output.");
|
||||
}
|
||||
}
|
||||
@@ -174,7 +174,13 @@ class PatchCommandEncoderTest {
|
||||
// 9
|
||||
Arguments.of(9, "", "-a"),
|
||||
// 10
|
||||
Arguments.of(10, "", "Ra"));
|
||||
Arguments.of(10, "", "Ra"),
|
||||
// 11
|
||||
Arguments.of(11, "abc", "D`"),
|
||||
// 12
|
||||
Arguments.of(12, "abc", "-`"),
|
||||
// 13
|
||||
Arguments.of(13, "", "D`"));
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
326
src/test/java/org/egothor/stemmer/PropertyBasedTestSupport.java
Normal file
326
src/test/java/org/egothor/stemmer/PropertyBasedTestSupport.java
Normal file
@@ -0,0 +1,326 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.function.IntFunction;
|
||||
|
||||
import net.jqwik.api.Arbitraries;
|
||||
import net.jqwik.api.Arbitrary;
|
||||
import net.jqwik.api.Combinators;
|
||||
import net.jqwik.api.Provide;
|
||||
import net.jqwik.api.arbitraries.ListArbitrary;
|
||||
|
||||
/**
 * Shared jqwik generators and helpers for property-based tests covering the
 * Radixor algorithmic core.
 *
 * <p>
 * The generated domains are intentionally bounded to keep CI execution time
 * predictable while still exploring a broad range of trie shapes, duplicate
 * insertions, missing lookups, and patch-command transformations.
 */
abstract class PropertyBasedTestSupport {

    /**
     * Shared array factory for string tries.
     */
    protected static final IntFunction<String[]> STRING_ARRAY_FACTORY = String[]::new;

    /**
     * Provides bounded lowercase words suitable for trie keys, stems, and patch
     * encoder inputs.
     *
     * <p>
     * NOTE(review): the alphabet is restricted to 'a'..'l' — presumably to make
     * shared prefixes and key collisions frequent; confirm before widening.
     *
     * @return bounded word generator
     */
    @Provide
    protected Arbitrary<String> words() {
        return Arbitraries.strings().withChars('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l')
                .ofMinLength(0).ofMaxLength(12);
    }

    /**
     * Provides non-empty lowercase words suitable for dictionary variants and
     * stems.
     *
     * @return bounded non-empty word generator
     */
    @Provide
    protected Arbitrary<String> nonEmptyWords() {
        return Arbitraries.strings().withChars('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l')
                .ofMinLength(1).ofMaxLength(12);
    }

    /**
     * Provides bounded insertion scenarios for trie-focused properties.
     *
     * @return trie scenario generator
     */
    @Provide
    protected Arbitrary<TrieScenario> trieScenarios() {
        final Arbitrary<TrieInsertion> insertionArbitrary = Combinators
                .combine(words(), nonEmptyWords(), Arbitraries.integers().between(1, 5)).as(TrieInsertion::new);

        final ListArbitrary<TrieInsertion> insertions = insertionArbitrary.list().ofMinSize(1).ofMaxSize(24);
        final Arbitrary<List<String>> observedKeys = words().list().ofMinSize(0).ofMaxSize(16);

        // Observed keys always contain every inserted key plus extra probes
        // that may miss the trie (see mergeObservedKeys).
        return Combinators.combine(insertions, observedKeys)
                .as((scenarioInsertions, additionalObservedKeys) -> new TrieScenario(scenarioInsertions,
                        mergeObservedKeys(scenarioInsertions, additionalObservedKeys)));
    }

    /**
     * Provides bounded stemmer scenarios where each variant word maps to one or
     * more acceptable stems.
     *
     * @return stemmer scenario generator
     */
    @Provide
    protected Arbitrary<StemmerScenario> stemmerScenarios() {
        final Arbitrary<StemmerEntry> entryArbitrary = Combinators
                .combine(nonEmptyWords(), nonEmptyWords().set().ofMinSize(1).ofMaxSize(4)).as((stem, variants) -> {
                    // Every entry accepts its own stem as a variant.
                    final LinkedHashSet<String> normalizedVariants = new LinkedHashSet<>(variants);
                    normalizedVariants.add(stem);
                    return new StemmerEntry(stem, normalizedVariants);
                });

        return entryArbitrary.list().ofMinSize(1).ofMaxSize(10).map(StemmerScenario::new);
    }

    /**
     * Builds a compiled trie from one generated scenario.
     *
     * @param scenario trie scenario
     * @param reductionMode reduction mode
     * @return compiled trie
     */
    protected FrequencyTrie<String> buildTrie(final TrieScenario scenario, final ReductionMode reductionMode) {
        Objects.requireNonNull(scenario, "scenario");
        Objects.requireNonNull(reductionMode, "reductionMode");

        final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
        for (TrieInsertion insertion : scenario.insertions()) {
            builder.put(insertion.key(), insertion.value(), insertion.count());
        }
        return builder.build();
    }

    /**
     * Builds a patch-command trie from one generated stemmer scenario.
     *
     * @param scenario stemmer scenario
     * @param reductionMode reduction mode
     * @param storeOriginal whether original stems should be stored using the
     *        canonical no-op patch
     * @return compiled patch-command trie
     */
    protected FrequencyTrie<String> buildStemmerTrie(final StemmerScenario scenario, final ReductionMode reductionMode,
            final boolean storeOriginal) {
        Objects.requireNonNull(scenario, "scenario");
        Objects.requireNonNull(reductionMode, "reductionMode");

        final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(STRING_ARRAY_FACTORY, reductionMode);
        final PatchCommandEncoder encoder = new PatchCommandEncoder();

        for (StemmerEntry entry : scenario.entries()) {
            if (storeOriginal) {
                builder.put(entry.stem(), PatchCommandEncoder.NOOP_PATCH);
            }
            for (String variant : entry.variants()) {
                // Identity mappings carry no patch information and are skipped.
                if (!variant.equals(entry.stem())) {
                    builder.put(variant, encoder.encode(variant, entry.stem()));
                }
            }
        }
        return builder.build();
    }

    /**
     * Merges observed lookup keys while preserving order and keeping scenario keys
     * relevant to actual trie content.
     *
     * @param insertions inserted trie mappings
     * @param additionalObservedKeys extra lookup probes
     * @return merged lookup-key set
     */
    private static Set<String> mergeObservedKeys(final List<TrieInsertion> insertions,
            final List<String> additionalObservedKeys) {
        final LinkedHashSet<String> observedKeys = new LinkedHashSet<>();
        for (TrieInsertion insertion : insertions) {
            observedKeys.add(insertion.key());
        }
        observedKeys.addAll(additionalObservedKeys);
        return observedKeys;
    }

    /**
     * Generated insertion into a trie builder.
     *
     * @param key trie key
     * @param value stored value
     * @param count positive insertion count
     */
    protected record TrieInsertion(String key, String value, int count) {

        /**
         * Creates a validated insertion descriptor.
         *
         * @param key trie key
         * @param value stored value
         * @param count positive insertion count
         */
        public TrieInsertion {
            Objects.requireNonNull(key, "key");
            Objects.requireNonNull(value, "value");
            if (count < 1) {
                throw new IllegalArgumentException("count must be at least 1.");
            }
        }
    }

    /**
     * Generated trie scenario used by multiple properties.
     *
     * @param insertions generated insertions
     * @param observedKeys lookup probes
     */
    protected record TrieScenario(List<TrieInsertion> insertions, Set<String> observedKeys) {

        /**
         * Creates a validated trie scenario.
         *
         * @param insertions generated insertions
         * @param observedKeys lookup probes
         */
        public TrieScenario {
            Objects.requireNonNull(insertions, "insertions");
            Objects.requireNonNull(observedKeys, "observedKeys");
            // Defensive immutable copies; emptiness is checked on the copy.
            insertions = List.copyOf(insertions);
            observedKeys = Set.copyOf(observedKeys);
            if (insertions.isEmpty()) {
                throw new IllegalArgumentException("insertions must not be empty.");
            }
        }

        @Override
        public String toString() {
            return "TrieScenario[insertions=" + this.insertions.size() + ", observedKeys=" + this.observedKeys.size()
                    + "]";
        }
    }

    /**
     * Generated stemmer dictionary line equivalent.
     *
     * @param stem canonical stem
     * @param variants variants accepted for the stem
     */
    protected record StemmerEntry(String stem, Set<String> variants) {

        /**
         * Creates a validated stemmer entry.
         *
         * @param stem canonical stem
         * @param variants variants accepted for the stem
         */
        public StemmerEntry {
            Objects.requireNonNull(stem, "stem");
            Objects.requireNonNull(variants, "variants");
            variants = Set.copyOf(variants);
            if (stem.isEmpty()) {
                throw new IllegalArgumentException("stem must not be empty.");
            }
            if (variants.isEmpty()) {
                throw new IllegalArgumentException("variants must not be empty.");
            }
        }
    }

    /**
     * Generated stemmer scenario used by patch-command trie properties.
     *
     * @param entries generated entries
     */
    protected record StemmerScenario(List<StemmerEntry> entries) {

        /**
         * Creates a validated stemmer scenario.
         *
         * @param entries generated entries
         */
        public StemmerScenario {
            Objects.requireNonNull(entries, "entries");
            entries = List.copyOf(entries);
            if (entries.isEmpty()) {
                throw new IllegalArgumentException("entries must not be empty.");
            }
        }

        /**
         * Returns all known source words that should be probeable in the resulting
         * trie.
         *
         * @return observed lookup words
         */
        public Set<String> observedWords() {
            final LinkedHashSet<String> observedWords = new LinkedHashSet<>();
            for (StemmerEntry entry : this.entries) {
                observedWords.add(entry.stem());
                observedWords.addAll(entry.variants());
            }
            return observedWords;
        }

        /**
         * Returns all acceptable stems for one observed word.
         *
         * <p>
         * A word may appear under several entries, so more than one stem can be
         * acceptable.
         *
         * @param word observed word
         * @return acceptable stems
         */
        public Set<String> acceptableStemsFor(final String word) {
            final LinkedHashSet<String> stems = new LinkedHashSet<>();
            for (StemmerEntry entry : this.entries) {
                if (entry.stem().equals(word) || entry.variants().contains(word)) {
                    stems.add(entry.stem());
                }
            }
            return stems;
        }

        @Override
        public String toString() {
            return "StemmerScenario[entries=" + this.entries.size() + "]";
        }
    }
}
|
||||
@@ -0,0 +1,151 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import net.jqwik.api.ForAll;
|
||||
import net.jqwik.api.Label;
|
||||
import net.jqwik.api.Property;
|
||||
import net.jqwik.api.Tag;
|
||||
|
||||
/**
 * Property-based tests for patch-command stemmer tries.
 *
 * <p>
 * These properties verify the most important semantic contract of compiled
 * stemmer dictionaries: every patch returned for a known input word must decode
 * to one of the acceptable stems declared by the source scenario, and binary
 * persistence must not alter that behavior.
 */
@Label("Stemmer patch trie properties")
@Tag("unit")
@Tag("property")
@Tag("stemming")
class StemmerPatchTrieProperties extends PropertyBasedTestSupport {

    /**
     * Verifies that every returned patch reconstructs only acceptable stems for the
     * observed word set represented by one generated stemmer scenario.
     *
     * @param scenario generated stemmer scenario
     * @param reductionMode reduction mode
     */
    @Property(tries = 60)
    @Label("returned patches should reconstruct only acceptable stems")
    void returnedPatchesShouldReconstructOnlyAcceptableStems(@ForAll("stemmerScenarios") final StemmerScenario scenario,
            @ForAll final ReductionMode reductionMode) {
        // storeOriginal=true: stems themselves are stored under the no-op patch.
        final FrequencyTrie<String> trie = buildStemmerTrie(scenario, reductionMode, true);

        for (String observedWord : scenario.observedWords()) {
            final Set<String> acceptableStems = scenario.acceptableStemsFor(observedWord);
            final String preferredPatch = trie.get(observedWord);
            final String[] allPatches = trie.getAll(observedWord);

            assertTrue(preferredPatch != null && !preferredPatch.isEmpty(),
                    "preferred patch must exist for an observed word.");
            assertTrue(allPatches.length >= 1, "at least one patch must exist for an observed word.");
            assertTrue(acceptableStems.contains(PatchCommandEncoder.apply(observedWord, preferredPatch)),
                    "preferred patch reconstructed an unexpected stem.");

            final Set<String> producedStems = applyAll(observedWord, allPatches);
            assertTrue(acceptableStems.containsAll(producedStems),
                    "getAll() must not expose a patch that reconstructs an undeclared stem.");

            if (acceptableStems.contains(observedWord)) {
                assertTrue(producedStems.contains(observedWord),
                        "storeOriginal semantics must preserve the original stem among returned results.");
            }
        }
    }

    /**
     * Verifies that GZip-compressed binary persistence preserves patch-command trie
     * lookups.
     *
     * @param scenario generated stemmer scenario
     * @param reductionMode reduction mode
     */
    @Property(tries = 30)
    @Label("binary persistence should preserve patch-command trie lookups")
    void binaryPersistenceShouldPreservePatchCommandTrieLookups(
            @ForAll("stemmerScenarios") final StemmerScenario scenario, @ForAll final ReductionMode reductionMode) {
        final FrequencyTrie<String> original = buildStemmerTrie(scenario, reductionMode, true);
        final FrequencyTrie<String> roundTripped = roundTripCompressed(original);

        for (String observedWord : scenario.observedWords()) {
            assertEquals(original.get(observedWord), roundTripped.get(observedWord),
                    "preferred patch lookup drifted after persistence.");
            assertArrayEquals(original.getAll(observedWord), roundTripped.getAll(observedWord),
                    "complete patch result set drifted after persistence.");
        }
    }

    /**
     * Applies all returned patches to the supplied source word.
     *
     * @param source source word
     * @param patches returned patches
     * @return decoded stem set
     */
    private static Set<String> applyAll(final String source, final String[] patches) {
        final LinkedHashSet<String> stems = new LinkedHashSet<>();
        for (String patch : patches) {
            stems.add(PatchCommandEncoder.apply(source, patch));
        }
        return stems;
    }

    /**
     * Round-trips one patch-command trie through the compressed binary helper.
     *
     * <p>
     * The round trip is purely in memory; no temporary files are created.
     *
     * @param trie trie to persist and reload
     * @return reloaded trie
     */
    private static FrequencyTrie<String> roundTripCompressed(final FrequencyTrie<String> trie) {
        try {
            final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
            StemmerPatchTrieBinaryIO.write(trie, byteArrayOutputStream);
            return StemmerPatchTrieBinaryIO.read(new ByteArrayInputStream(byteArrayOutputStream.toByteArray()));
        } catch (IOException exception) {
            // IOException is not expected from in-memory streams; surface it unchecked.
            throw new UncheckedIOException("Unexpected compressed binary round-trip failure.", exception);
        }
    }
}
|
||||
@@ -0,0 +1,148 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software must
|
||||
* display the following acknowledgement:
|
||||
* This product includes software developed by the Egothor project.
|
||||
*
|
||||
* 4. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer.trie;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link CompiledNode} and {@link NodeData} validation and
|
||||
* documented backing-array exposure.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("fast")
|
||||
@Tag("trie")
|
||||
@DisplayName("CompiledNode and NodeData")
|
||||
class CompiledNodeAndNodeDataTest {
|
||||
|
||||
/**
|
||||
* Verifies that {@link NodeData} rejects mismatched edge-related array lengths.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("NodeData rejects mismatched edge arrays")
|
||||
void nodeDataShouldRejectMismatchedEdgeArrays() {
|
||||
final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new NodeData<String>(new char[] { 'a' }, new int[0], new String[0], new int[0]));
|
||||
|
||||
assertEquals("edgeLabels and childNodeIds must have the same length.", exception.getMessage());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that {@link NodeData} rejects mismatched value-related array
|
||||
* lengths.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("NodeData rejects mismatched value arrays")
|
||||
void nodeDataShouldRejectMismatchedValueArrays() {
|
||||
final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new NodeData<String>(new char[0], new int[0], new String[] { "stem" }, new int[0]));
|
||||
|
||||
assertEquals("orderedValues and orderedCounts must have the same length.", exception.getMessage());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that {@link NodeData} continues to expose the documented backing
|
||||
* arrays directly.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("NodeData accessors expose documented backing arrays")
|
||||
void nodeDataAccessorsShouldExposeDocumentedBackingArrays() {
|
||||
final char[] edgeLabels = new char[] { 'a' };
|
||||
final int[] childNodeIds = new int[] { 7 };
|
||||
final String[] orderedValues = new String[] { "stem" };
|
||||
final int[] orderedCounts = new int[] { 3 };
|
||||
final NodeData<String> nodeData = new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts);
|
||||
|
||||
assertSame(edgeLabels, nodeData.edgeLabels());
|
||||
assertSame(childNodeIds, nodeData.childNodeIds());
|
||||
assertSame(orderedValues, nodeData.orderedValues());
|
||||
assertSame(orderedCounts, nodeData.orderedCounts());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that {@link CompiledNode} rejects mismatched edge and child arrays.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("CompiledNode rejects mismatched edge and child arrays")
|
||||
void compiledNodeShouldRejectMismatchedEdgeAndChildArrays() {
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<String>[] children = new CompiledNode[0];
|
||||
|
||||
final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new CompiledNode<String>(new char[] { 'a' }, children, new String[0], new int[0]));
|
||||
|
||||
assertEquals("edgeLabels and children must have the same length.", exception.getMessage());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that {@link CompiledNode} rejects mismatched value arrays.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("CompiledNode rejects mismatched value arrays")
|
||||
void compiledNodeShouldRejectMismatchedValueArrays() {
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<String>[] children = new CompiledNode[0];
|
||||
|
||||
final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
|
||||
() -> new CompiledNode<String>(new char[0], children, new String[] { "stem" }, new int[0]));
|
||||
|
||||
assertEquals("orderedValues and orderedCounts must have the same length.", exception.getMessage());
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that {@link CompiledNode} continues to expose the documented backing
|
||||
* arrays directly.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("CompiledNode accessors expose documented backing arrays")
|
||||
void compiledNodeAccessorsShouldExposeDocumentedBackingArrays() {
|
||||
final char[] edgeLabels = new char[] { 'a' };
|
||||
@SuppressWarnings("unchecked")
|
||||
final CompiledNode<String>[] children = new CompiledNode[1];
|
||||
final String[] orderedValues = new String[] { "stem" };
|
||||
final int[] orderedCounts = new int[] { 5 };
|
||||
final CompiledNode<String> node = new CompiledNode<>(edgeLabels, children, orderedValues, orderedCounts);
|
||||
|
||||
assertSame(edgeLabels, node.edgeLabels());
|
||||
assertSame(children, node.children());
|
||||
assertSame(orderedValues, node.orderedValues());
|
||||
assertSame(orderedCounts, node.orderedCounts());
|
||||
}
|
||||
}
|
||||
253
tools/generate-pages-badges.py
Executable file
253
tools/generate-pages-badges.py
Executable file
@@ -0,0 +1,253 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate GitHub Pages badge endpoint JSON files from CI report artifacts.
|
||||
|
||||
This script derives compact machine-readable badge payloads from:
|
||||
|
||||
- JaCoCo XML coverage report
|
||||
- PIT mutation testing XML report
|
||||
- JMH CSV benchmark report
|
||||
|
||||
The generated JSON files are intended to be consumed by Shields endpoint badges.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for the badge generator.

    All five options are required; argparse exits with an error message
    when any of them is missing.
    """
    # (flag, help text) pairs, registered in a loop to keep the option
    # declarations compact and uniform.
    option_specs = (
        ("--jacoco-xml", "Path to the JaCoCo XML report."),
        ("--pit-xml", "Path to the PIT XML report."),
        ("--jmh-csv", "Path to the JMH CSV report."),
        ("--run-metrics-dir", "Target directory for the current build badge JSON files."),
        ("--latest-metrics-dir", "Target directory for the latest build badge JSON files."),
    )

    parser = argparse.ArgumentParser(
        description="Generate GitHub Pages badge metadata from build reports."
    )
    for flag, help_text in option_specs:
        parser.add_argument(flag, required=True, help=help_text)
    return parser.parse_args()
|
||||
|
||||
|
||||
def write_json(target: Path, payload: dict[str, object]) -> None:
    """Write a badge payload as formatted UTF-8 JSON with a trailing newline.

    Creates missing parent directories. The terminator is a literal "\\n"
    rather than os.linesep: write_text opens in text mode, which already
    translates "\\n" to the platform line ending, so os.linesep would
    produce "\\r\\r\\n" on Windows and the Shields endpoint JSON would not
    be byte-stable across platforms.
    """
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def unavailable_payload(label: str) -> dict[str, object]:
    """Return the standard grey "not available" badge payload for *label*."""
    return dict(
        schemaVersion=1,
        label=label,
        message="not available",
        color="lightgrey",
    )
|
||||
|
||||
|
||||
def color_for_percentage(value: float) -> str:
    """Select a badge color for a percentage value (higher is better)."""
    # Lower bounds checked from best to worst; first match wins.
    bands = (
        (85.0, "brightgreen"),
        (70.0, "green"),
        (55.0, "yellow"),
        (40.0, "orange"),
    )
    for lower_bound, color in bands:
        if value >= lower_bound:
            return color
    return "red"
|
||||
|
||||
|
||||
def color_for_speedup(value: float) -> str:
    """Select a badge color for a speedup factor (higher is better)."""
    # Lower bounds checked from best to worst; first match wins.
    bands = (
        (4.0, "brightgreen"),
        (3.0, "green"),
        (2.0, "yellow"),
        (1.0, "orange"),
    )
    for lower_bound, color in bands:
        if value >= lower_bound:
            return color
    return "red"
|
||||
|
||||
|
||||
def coverage_payload(jacoco_xml: Path) -> dict[str, object]:
    """Build a line coverage badge payload from a JaCoCo XML report.

    Degrades to the "not available" payload when the report file is
    missing, malformed, or lacks a report-level LINE counter, so a broken
    report never fails the whole badge-generation step.
    """
    if not jacoco_xml.is_file():
        return unavailable_payload("coverage")

    try:
        root = ET.parse(jacoco_xml).getroot()
    except ET.ParseError:
        # Truncated/corrupt XML is treated like a missing report.
        return unavailable_payload("coverage")

    # JaCoCo places aggregate counters directly under <report>; select the
    # LINE counter via ElementTree's XPath attribute predicate.
    line_counter = root.find("counter[@type='LINE']")
    if line_counter is None:
        return unavailable_payload("coverage")

    missed = int(line_counter.attrib.get("missed", "0"))
    covered = int(line_counter.attrib.get("covered", "0"))
    total = missed + covered
    # Guard against a zero-line report to avoid division by zero.
    percentage = 0.0 if total == 0 else (100.0 * covered / total)

    return {
        "schemaVersion": 1,
        "label": "coverage",
        "message": f"{percentage:.1f}%",
        "color": color_for_percentage(percentage),
    }
|
||||
|
||||
|
||||
def mutation_payload(pit_xml: Path) -> dict[str, object]:
    """Build a mutation score badge payload from a PIT XML report.

    Prefers the aggregate ``mutationCoverage`` root attribute when present;
    otherwise recomputes the score from individual <mutation> statuses.
    Degrades to the "not available" payload when the report is missing,
    malformed, or carries a non-numeric ``mutationCoverage`` value.
    """
    if not pit_xml.is_file():
        return unavailable_payload("mutation")

    try:
        root = ET.parse(pit_xml).getroot()
    except ET.ParseError:
        # Truncated/corrupt XML is treated like a missing report.
        return unavailable_payload("mutation")

    mutation_coverage = root.attrib.get("mutationCoverage")
    if mutation_coverage is not None:
        try:
            score = float(mutation_coverage)
        except ValueError:
            # Attribute present but not a number: degrade instead of crashing.
            return unavailable_payload("mutation")
    else:
        # Statuses counted as "detected" mutants.
        # NOTE(review): assumed to mirror PIT's own scoring — confirm against
        # the PIT version in use.
        detected_statuses = {
            "KILLED",
            "TIMED_OUT",
            "MEMORY_ERROR",
            "RUN_ERROR",
            "NON_VIABLE",
        }
        mutations = root.findall("mutation")
        total = len(mutations)
        detected = sum(
            1
            for mutation in mutations
            if mutation.attrib.get("status") in detected_statuses
        )
        # Guard against an empty report to avoid division by zero.
        score = 0.0 if total == 0 else (100.0 * detected / total)

    return {
        "schemaVersion": 1,
        "label": "mutation",
        "message": f"{score:.1f}%",
        "color": color_for_percentage(score),
    }
|
||||
|
||||
|
||||
def parse_family_count(row: dict[str, str]) -> int:
    """Extract the JMH familyCount parameter from a CSV row.

    Only the first column whose header looks like "Param: ...familyCount"
    is considered; returns -1 when no such column exists or its value is
    not an integer.
    """
    candidates = (
        cell
        for header, cell in row.items()
        if header.startswith("Param: ") and header.endswith("familyCount")
    )
    raw_value = next(candidates, None)
    if raw_value is None:
        return -1
    try:
        return int(raw_value)
    except (TypeError, ValueError):
        return -1
|
||||
|
||||
|
||||
def benchmark_payload(jmh_csv: Path) -> dict[str, object]:
    """Build a benchmark speedup badge payload from a JMH CSV report.

    Compares the Radixor English stemmer against the Snowball Porter
    baseline at the largest familyCount present, and reports the speedup
    factor. Returns the "not available" payload when the report is
    missing, empty, or lacks both benchmark rows.
    """
    if not jmh_csv.is_file():
        return unavailable_payload("english benchmark")

    with jmh_csv.open("r", encoding="utf-8", newline="") as handle:
        all_rows = list(csv.DictReader(handle))

    if not all_rows:
        return unavailable_payload("english benchmark")

    radixor_suffix = "EnglishStemmerComparisonBenchmark.radixorUsUkProfiPreferredStem"
    porter_suffix = "EnglishStemmerComparisonBenchmark.snowballOriginalPorter"

    # (familyCount, benchmark name, score) for the two benchmarks of interest.
    samples: list[tuple[int, str, float]] = []
    for entry in all_rows:
        name = entry.get("Benchmark", "")
        if not (name.endswith(radixor_suffix) or name.endswith(porter_suffix)):
            continue
        try:
            measured = float(entry["Score"])
        except (KeyError, TypeError, ValueError):
            continue
        samples.append((parse_family_count(entry), name, measured))

    if not samples:
        return unavailable_payload("english benchmark")

    # Compare at the largest workload; -1 means familyCount was absent.
    largest_family = max(sample[0] for sample in samples)
    radixor_time = None
    porter_time = None
    for family, name, measured in samples:
        if family != largest_family:
            continue
        if name.endswith(".radixorUsUkProfiPreferredStem"):
            radixor_time = measured
        elif name.endswith(".snowballOriginalPorter"):
            porter_time = measured

    if radixor_time is None or porter_time is None or porter_time <= 0.0:
        return unavailable_payload("english benchmark")

    # score is time for the batch processing, i.e. longer => slower, i.e. speedup is porter/radixor
    ratio = porter_time / radixor_time
    family_suffix = "" if largest_family < 0 else f" ({largest_family})"
    return {
        "schemaVersion": 1,
        "label": "english benchmark",
        "message": f"{ratio:.1f}x vs Porter{family_suffix}",
        "color": color_for_speedup(ratio),
    }
|
||||
|
||||
|
||||
def main() -> int:
    """Generate all requested badge metadata files; returns exit status 0.

    Each payload is written twice: once into the per-run metrics directory
    and once into the "latest" directory.
    """
    options = parse_args()

    badge_payloads = {
        "coverage-badge.json": coverage_payload(Path(options.jacoco_xml)),
        "pitest-badge.json": mutation_payload(Path(options.pit_xml)),
        "jmh-badge.json": benchmark_payload(Path(options.jmh_csv)),
    }

    target_dirs = (Path(options.run_metrics_dir), Path(options.latest_metrics_dir))
    for file_name, payload in badge_payloads.items():
        for directory in target_dirs:
            write_json(directory / file_name, payload)

    return 0
|
||||
|
||||
|
||||
# Script entry point; SystemExit propagates main()'s status to the shell
# while leaving the module importable without side effects.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
114
tools/generate-release-notes.sh
Executable file
114
tools/generate-release-notes.sh
Executable file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env bash
|
||||
set -Eeuo pipefail

# Resolve the release tag: prefer the CI-provided GITHUB_REF_NAME, then the
# first positional argument.
current_tag="${GITHUB_REF_NAME:-${1:-}}"
if [[ -z "${current_tag}" ]]; then
  echo "Current tag is not set. Provide it as GITHUB_REF_NAME or as the first argument." >&2
  exit 1
fi

# Every release tag is expected to look like "release@<version>".
release_prefix="release@"

if [[ "${current_tag}" != "${release_prefix}"* ]]; then
  echo "Current tag '${current_tag}' does not start with expected prefix '${release_prefix}'." >&2
  exit 1
fi

# Best-effort tag refresh; shallow CI checkouts may not have all tags yet.
git fetch --tags --force >/dev/null 2>&1 || true

# Bare version numbers of all release tags, sorted semantically (sort -V).
all_versions="$(git tag --list "${release_prefix}*" | sed "s/^${release_prefix}//" | sort -V)"

# Walk the sorted versions and remember the tag immediately preceding the
# current one; stays empty when the current tag is the first release.
previous_tag=""
for version in ${all_versions}; do
  if [[ "${release_prefix}${version}" == "${current_tag}" ]]; then
    break
  fi
  previous_tag="${release_prefix}${version}"
done

# Commit range for the notes: previous..current, or the current tag's whole
# history for the very first release.
if [[ -n "${previous_tag}" ]]; then
  range="${previous_tag}..${current_tag}"
else
  range="${current_tag}"
fi

echo "Generating release notes for range: ${range}" >&2
|
||||
|
||||
# Conventional-commit categories in output order; each entry is
# "<type>|<section title>".
declare -a CATEGORY_ORDER=(
  "feat|Features"
  "fix|Bug Fixes"
  "perf|Performance"
  "refactor|Refactoring"
  "docs|Documentation"
  "test|Tests"
  "build|Build System"
  "ci|CI/CD"
  "style|Style"
  "chore|Maintenance"
  "revert|Reverts"
)

# CATEGORY_TITLES: type -> human-readable section title.
# CATEGORY_ITEMS:  type -> accumulated newline-terminated "- message" lines.
declare -A CATEGORY_TITLES
declare -A CATEGORY_ITEMS

# Split each "key|title" entry into the two lookup tables and start every
# category with an empty item list.
for entry in "${CATEGORY_ORDER[@]}"; do
  key="${entry%%|*}"
  title="${entry##*|}"
  CATEGORY_TITLES["${key}"]="${title}"
  CATEGORY_ITEMS["${key}"]=""
done

# Matches "type(optional scope)!?: message"; capture 1 is the type,
# capture 3 the message text.
supported_prefix_pattern='^(feat|fix|perf|refactor|docs|test|build|ci|style|chore|revert)(\([^)]+\))?!?:[[:space:]]*(.+)$'
# ASCII unit separator (0x1f): cannot appear in commit subjects, so it is a
# safe field delimiter for the git log format string below.
separator=$'\x1f'
|
||||
|
||||
#######################################
# Classify one commit-message line into its conventional-commit bucket.
# Globals:   supported_prefix_pattern (read), CATEGORY_ITEMS (written)
# Arguments: $1 - raw commit subject or body line
# Returns:   0 always; silently ignores blank or non-matching lines
#######################################
append_line() {
  local trimmed="$1"
  local bucket
  local text

  # Strip carriage returns and surrounding whitespace with parameter
  # expansion only (no tr/sed subshell per line).
  trimmed="${trimmed//$'\r'/}"
  trimmed="${trimmed#"${trimmed%%[![:space:]]*}"}"
  trimmed="${trimmed%"${trimmed##*[![:space:]]}"}"
  [[ -n "${trimmed}" ]] || return 0

  [[ "${trimmed}" =~ ${supported_prefix_pattern} ]] || return 0

  bucket="${BASH_REMATCH[1]}"
  text="${BASH_REMATCH[3]}"
  [[ -n "${text}" ]] || return 0

  CATEGORY_ITEMS["${bucket}"]+="- ${text}"$'\n'
}
|
||||
|
||||
# Scan each commit record (hash, subject, body) for categorized lines.
# NOTE(review): %b bodies are multi-line, but read consumes one line per
# iteration — continuation lines of a body arrive as records whose whole
# text lands in commit_hash (no separator present), so only the FIRST body
# line is actually classified. Confirm whether multi-line bodies should be
# fully supported (e.g. via a NUL record separator).
while IFS="${separator}" read -r commit_hash subject body; do
  [[ -z "${commit_hash}" ]] && continue

  # Skip merge commits and the repository bootstrap commit.
  if [[ "${subject}" =~ ^Merge[[:space:]] ]] || [[ "${subject}" == "Initial commit" ]]; then
    continue
  fi

  append_line "${subject}"

  # Also classify each line of the commit body (see NOTE above).
  while IFS= read -r body_line; do
    append_line "${body_line}"
  done <<< "${body}"
done < <(git log "${range}" --no-merges --pretty=format:"%H${separator}%s${separator}%b")

# Assemble the Markdown document, emitting only non-empty sections in the
# canonical category order.
body_text="## What's New"

for entry in "${CATEGORY_ORDER[@]}"; do
  key="${entry%%|*}"
  title="${CATEGORY_TITLES[${key}]}"
  items="${CATEGORY_ITEMS[${key}]}"

  if [[ -n "${items}" ]]; then
    body_text+=$'\n\n'"### ${title}"$'\n'
    # Drop any blank lines that accumulated between items.
    body_text+="$(printf '%s' "${items}" | sed '/^[[:space:]]*$/d')"
  fi
done

# Fallback body when no commit matched any category.
if [[ "${body_text}" == "## What's New" ]]; then
  body_text+=$'\n\n'"No categorized changes were found in commit subjects or bodies for this release range."
fi

printf '%s\n' "${body_text}"
|
||||
Reference in New Issue
Block a user