diff --git a/.classpath b/.classpath
index 6c1e625..e3393aa 100644
--- a/.classpath
+++ b/.classpath
@@ -3,20 +3,27 @@
-
+
-
+
-
+
+
+
+
+
+
+
+
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
new file mode 100644
index 0000000..9b51665
--- /dev/null
+++ b/.github/workflows/benchmarks.yml
@@ -0,0 +1,52 @@
+name: Benchmarks
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 3 * * 1' # Mondays at 03:00 UTC
+  push:
+    branches:
+      - main
+    paths: # only build inputs that can influence benchmark results
+      - 'src/main/**'
+      - 'src/jmh/**'
+      - 'build.gradle'
+      - 'settings.gradle'
+      - 'gradle/**'
+      - 'gradlew'
+      - 'gradlew.bat'
+      - '.github/workflows/benchmarks.yml'
+
+jobs:
+  jmh:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+
+    permissions:
+      contents: read
+
+    steps:
+      - name: Check out sources
+        uses: actions/checkout@v4
+
+      - name: Set up JDK 21
+        uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: '21'
+          cache: gradle
+
+      - name: Make Gradle executable
+        run: chmod +x ./gradlew
+
+      - name: Run JMH benchmarks
+        run: ./gradlew clean jmh --no-daemon
+
+      - name: Upload JMH reports
+        uses: actions/upload-artifact@v4
+        with:
+          name: jmh-reports
+          path: |
+            build/reports/jmh/**
+            build/results/jmh/**
+          if-no-files-found: warn
\ No newline at end of file
diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml
index 6daaf0a..02b2931 100644
--- a/.github/workflows/pages.yml
+++ b/.github/workflows/pages.yml
@@ -4,6 +4,16 @@ on:
push:
branches:
- main
+ paths:
+ - 'src/main/**'
+ - 'src/test/**'
+ - 'src/jmh/**'
+ - 'build.gradle'
+ - 'settings.gradle'
+ - 'gradle/**'
+ - 'gradlew'
+ - 'gradlew.bat'
+ - '.github/workflows/pages.yml'
workflow_dispatch:
permissions:
@@ -37,7 +47,7 @@ jobs:
uses: gradle/actions/setup-gradle@v4
- name: Build reports for publication
- run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport pitest
+ run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport pitest jmh
- name: Prepare gh-pages worktree
shell: bash
diff --git a/build.gradle b/build.gradle
index 90bb2e7..4b40b17 100644
--- a/build.gradle
+++ b/build.gradle
@@ -1,15 +1,19 @@
plugins {
id 'java'
+ id 'eclipse'
id 'application'
id 'pmd'
id 'jacoco'
id 'info.solidsoft.pitest' version '1.19.0'
+ id 'me.champeau.jmh' version '0.7.2'
id 'com.palantir.git-version' version '4.0.0'
}
group = 'org.egothor.stemmer'
version = gitVersion(prefix:'release@')
+def benchmarkReportsDirectory = layout.buildDirectory.dir('reports/jmh')
+
configurations {
mockitoAgent
}
@@ -34,6 +38,8 @@ repositories {
}
dependencies {
+ jmhImplementation sourceSets.main.output
+
testImplementation platform(libs.junit.bom)
testImplementation libs.junit.jupiter
testRuntimeOnly libs.junit.platform.launcher
@@ -104,6 +110,24 @@ application {
mainClass = 'org.egothor.stemmer.Compile'
}
+jmh { // Settings for the me.champeau.jmh plugin; they apply to every benchmark run by the jmh task.
+ jmhVersion = '1.37'
+ warmupIterations = 3 // same count as the @Warmup annotations on the benchmark classes
+ iterations = 5 // same count as the @Measurement annotations on the benchmark classes
+ fork = 1
+ benchmarkMode = ['avgt'] // average time per benchmark invocation
+ timeUnit = 'ns' // NOTE(review): the compilation benchmark declares MILLISECONDS; presumably this wins - confirm
+ resultFormat = 'CSV'
+ resultsFile = benchmarkReportsDirectory.map { it.file('jmh-results.csv').asFile }.get() // .get() resolves the provider at configuration time
+ humanOutputFile = benchmarkReportsDirectory.map { it.file('jmh-results.txt').asFile }.get() // human-readable log next to the CSV
+ duplicateClassesStrategy = DuplicatesStrategy.EXCLUDE // NOTE(review): presumably needed because jmhImplementation re-adds sourceSets.main.output - confirm
+}
+
+tasks.named('jmh') { // assigns group and description metadata to the plugin-provided task
+ group = 'verification'
+ description = 'Runs JMH benchmarks for the Radixor algorithmic core.'
+}
+
javadoc {
failOnError = false
diff --git a/src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java b/src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java
new file mode 100644
index 0000000..f335861
--- /dev/null
+++ b/src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java
@@ -0,0 +1,208 @@
+package org.egothor.stemmer.benchmark;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.SplittableRandom;
+import org.egothor.stemmer.FrequencyTrie;
+import org.egothor.stemmer.PatchCommandEncoder;
+import org.egothor.stemmer.ReductionSettings;
+import org.egothor.stemmer.StemmerDictionaryParser;
+
+/**
+ * Builds deterministic benchmark corpora used by the JMH suite.
+ *
+ *
+ * The generated corpus is intentionally synthetic but morphology-shaped: it
+ * creates a stable base vocabulary and derives common inflectional and
+ * derivational variants from each stem. The corpus also injects a controlled
+ * amount of homograph ambiguity so that {@link FrequencyTrie#getAll(String)} is
+ * measured on keys that really produce multiple candidate patch commands.
+ *
+ */
+final class BenchmarkCorpusSupport {
+
+ /**
+ * Prefixes used to synthesize pronounceable stems.
+ */
+ private static final String[] PREFIXES = {
+ "adapt", "align", "anchor", "answer", "apply", "balance", "build", "capture", "center",
+ "change", "collect", "connect", "convert", "cover", "create", "cycle", "declare", "define",
+ "deliver", "derive", "design", "detect", "develop", "drive", "encode", "extend", "filter",
+ "form", "govern", "handle", "improve", "index", "inform", "inspect", "join", "launch",
+ "limit", "manage", "map", "model", "move", "observe", "operate", "organ", "pattern",
+ "perform", "plan", "predict", "prepare", "process", "project", "protect", "publish", "query",
+ "reduce", "refresh", "render", "repeat", "resolve", "return", "scale", "search", "select",
+ "shape", "signal", "sort", "state", "store", "stream", "structure", "supply", "support",
+ "switch", "trace", "transform", "update", "validate", "value"
+ };
+
+ /**
+ * Suffixes used to diversify stems.
+ */
+ private static final String[] STEM_SUFFIXES = {
+ "", "er", "or", "al", "ive", "ion", "ent", "ant", "ure", "ment", "ist", "ity"
+ };
+
+ /**
+ * Number of neighboring stems sharing one ambiguous surface form.
+ */
+ private static final int HOMOGRAPH_GROUP_SIZE = 4;
+
+ /**
+ * Utility class.
+ */
+ private BenchmarkCorpusSupport() {
+ throw new AssertionError("No instances.");
+ }
+
+ /**
+ * Creates a deterministic benchmark corpus.
+ *
+ * @param stemCount number of canonical stems to generate
+ * @return immutable benchmark corpus description
+ */
+ static BenchmarkCorpus createCorpus(final int stemCount) {
+ if (stemCount < 1) {
+ throw new IllegalArgumentException("stemCount must be at least 1.");
+ }
+
+ final StringBuilder dictionaryBuilder = new StringBuilder(stemCount * 120);
+ final LinkedHashSet lookupKeys = new LinkedHashSet<>(stemCount * 8);
+ final LinkedHashSet ambiguousLookupKeys = new LinkedHashSet<>(Math.max(1, stemCount / 4));
+ final SplittableRandom random = new SplittableRandom(20260414L);
+
+ for (int index = 0; index < stemCount; index++) {
+ final String stem = createStem(index);
+ final String[] variants = createVariants(stem, random, index);
+
+ dictionaryBuilder.append(stem);
+ lookupKeys.add(stem);
+ for (String variant : variants) {
+ dictionaryBuilder.append(' ').append(variant);
+ lookupKeys.add(variant);
+ }
+
+ final String homograph = createHomograph(index);
+ dictionaryBuilder.append(' ').append(homograph);
+ lookupKeys.add(homograph);
+ ambiguousLookupKeys.add(homograph);
+
+ dictionaryBuilder.append('\n');
+ }
+
+ return new BenchmarkCorpus(
+ dictionaryBuilder.toString(),
+ lookupKeys.toArray(String[]::new),
+ ambiguousLookupKeys.toArray(String[]::new));
+ }
+
+ /**
+ * Builds a compiled trie from benchmark corpus text.
+ *
+ * @param corpusText line-oriented dictionary text
+ * @param reductionSettings reduction settings
+ * @param storeOriginalStem whether the canonical stem itself should also be
+ * inserted with the no-op patch
+ * @return compiled trie containing patch commands
+ * @throws IOException if parsing fails
+ */
+ static FrequencyTrie compilePatchTrie(
+ final String corpusText,
+ final ReductionSettings reductionSettings,
+ final boolean storeOriginalStem) throws IOException {
+ Objects.requireNonNull(corpusText, "corpusText");
+ Objects.requireNonNull(reductionSettings, "reductionSettings");
+
+ final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings);
+ final PatchCommandEncoder encoder = new PatchCommandEncoder();
+
+ StemmerDictionaryParser.parse(
+ new StringReader(corpusText),
+ "benchmark-corpus",
+ (stem, variants, lineNumber) -> {
+ if (storeOriginalStem) {
+ builder.put(stem, encoder.encode(stem, stem));
+ }
+ for (String variant : variants) {
+ builder.put(variant, encoder.encode(variant, stem));
+ }
+ });
+
+ return builder.build();
+ }
+
+ /**
+ * Creates one deterministic stem.
+ *
+ * @param index stem ordinal
+ * @return generated stem
+ */
+ private static String createStem(final int index) {
+ final String prefix = PREFIXES[index % PREFIXES.length];
+ final String suffix = STEM_SUFFIXES[(index / PREFIXES.length) % STEM_SUFFIXES.length];
+ return (prefix + suffix + base36(index)).toLowerCase(Locale.ROOT);
+ }
+
+ /**
+ * Creates a set of deterministic variants for one stem.
+ *
+ * @param stem canonical stem
+ * @param random deterministic random source
+ * @param index stem ordinal
+ * @return generated variants in stable order
+ */
+ private static String[] createVariants(final String stem, final SplittableRandom random, final int index) {
+ final List variants = new ArrayList<>(8);
+ variants.add(stem + "s");
+ variants.add(stem + "ed");
+ variants.add(stem + "ing");
+ variants.add(stem + "er");
+ variants.add(stem + "ers");
+ variants.add("pre" + stem);
+ variants.add(stem + random.nextInt(10));
+
+ if ((index & 1) == 0) {
+ variants.add(stem + "ly");
+ }
+ if (stem.length() > 5) {
+ variants.add(stem.substring(0, stem.length() - 1));
+ }
+ return variants.toArray(String[]::new);
+ }
+
+ /**
+ * Creates an ambiguous surface form shared by a small group of stems.
+ *
+ * @param index stem ordinal
+ * @return shared homograph form
+ */
+ private static String createHomograph(final int index) {
+ return "shared" + base36(index / HOMOGRAPH_GROUP_SIZE);
+ }
+
+ /**
+ * Converts an ordinal into a compact base-36 discriminator.
+ *
+ * @param value numeric value
+ * @return compact discriminator
+ */
+ private static String base36(final int value) {
+ return Integer.toString(value, Character.MAX_RADIX);
+ }
+
+ /**
+ * Immutable benchmark corpus.
+ *
+ * @param dictionaryText full line-oriented dictionary text
+ * @param lookupKeys keys used for general lookup measurements
+ * @param ambiguousLookupKeys keys that return multiple patch candidates from
+ * {@code getAll()}
+ */
+ record BenchmarkCorpus(String dictionaryText, String[] lookupKeys, String[] ambiguousLookupKeys) {
+ }
+}
diff --git a/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieCompilationBenchmark.java b/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieCompilationBenchmark.java
new file mode 100644
index 0000000..d77b461
--- /dev/null
+++ b/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieCompilationBenchmark.java
@@ -0,0 +1,94 @@
+package org.egothor.stemmer.benchmark;
+
+import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+import org.egothor.stemmer.ReductionMode;
+import org.egothor.stemmer.ReductionSettings;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * Benchmarks end-to-end dictionary compilation for different reduction modes.
+ *
+ * <p>
+ * This benchmark measures the offline path that matters for dictionary build
+ * workflows: dictionary parsing, patch-command generation, mutable trie
+ * population, subtree reduction, and freezing into the compiled read-only trie.
+ * </p>
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Warmup(iterations = 3, time = 1)
+@Measurement(iterations = 5, time = 1)
+public class FrequencyTrieCompilationBenchmark {
+
+    /**
+     * Shared benchmark state for compilation scenarios.
+     */
+    @State(Scope.Benchmark)
+    public static class CompilationState {
+
+        /**
+         * Number of canonical stems to generate.
+         */
+        @Param({ "2000", "10000" })
+        public int stemCount;
+
+        /**
+         * Reduction mode used during trie compilation.
+         */
+        @Param({
+            "MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS",
+            "MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS",
+            "MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS"
+        })
+        public String reductionMode;
+
+        /**
+         * Whether to store the stem itself using the canonical no-op patch.
+         */
+        @Param({ "true", "false" })
+        public boolean storeOriginalStem;
+
+        /**
+         * Full dictionary text used as the benchmark input.
+         */
+        private String dictionaryText;
+
+        /** Reduction settings resolved once per trial from {@code reductionMode}. */
+        private ReductionSettings settings;
+
+        /**
+         * Initializes the benchmark state so that corpus generation and settings
+         * resolution stay outside the measured region.
+         */
+        @Setup(Level.Trial)
+        public void setUp() {
+            this.dictionaryText = BenchmarkCorpusSupport.createCorpus(this.stemCount).dictionaryText();
+            this.settings = ReductionSettings.withDefaults(ReductionMode.valueOf(this.reductionMode));
+        }
+    }
+
+    /**
+     * Measures end-to-end patch trie compilation latency.
+     *
+     * @param state prepared compilation state
+     * @param blackhole sink preventing dead-code elimination
+     * @throws IOException if dictionary parsing fails
+     */
+    @Benchmark
+    public void compilePatchTrie(final CompilationState state, final Blackhole blackhole) throws IOException {
+        blackhole.consume(BenchmarkCorpusSupport.compilePatchTrie(
+                state.dictionaryText, state.settings, state.storeOriginalStem));
+    }
+}
diff --git a/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieLookupBenchmark.java b/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieLookupBenchmark.java
new file mode 100644
index 0000000..3325af3
--- /dev/null
+++ b/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieLookupBenchmark.java
@@ -0,0 +1,160 @@
+package org.egothor.stemmer.benchmark;
+
+import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+import org.egothor.stemmer.FrequencyTrie;
+import org.egothor.stemmer.PatchCommandEncoder;
+import org.egothor.stemmer.ReductionMode;
+import org.egothor.stemmer.ReductionSettings;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+/**
+ * Benchmarks lookup-oriented operations on compiled Radixor tries.
+ *
+ * <p>
+ * The benchmark uses a deterministic morphology-shaped corpus and measures the
+ * latency of the hot-path lookup operations that are relevant at runtime:
+ * retrieving the preferred patch command, retrieving all candidate patch
+ * commands, and reconstructing stems from the returned patch values.
+ * </p>
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Warmup(iterations = 3, time = 1)
+@Measurement(iterations = 5, time = 1)
+public class FrequencyTrieLookupBenchmark {
+
+    /**
+     * Shared benchmark state for lookup scenarios.
+     */
+    @State(Scope.Benchmark)
+    public static class LookupState {
+
+        /**
+         * Number of canonical stems to generate.
+         */
+        @Param({ "2000", "10000" })
+        public int stemCount;
+
+        /**
+         * Reduction mode used to compile the lookup trie.
+         */
+        @Param({
+            "MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS",
+            "MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS",
+            "MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS"
+        })
+        public String reductionMode;
+
+        /**
+         * Compiled trie under test, mapping surface forms to patch commands.
+         */
+        private FrequencyTrie<String> trie;
+
+        /**
+         * Deterministic lookup keys.
+         */
+        private String[] lookupKeys;
+
+        /**
+         * Keys that are known to return multiple patch candidates from
+         * {@code getAll()}.
+         */
+        private String[] ambiguousLookupKeys;
+
+        /**
+         * Initializes the benchmark state.
+         *
+         * @throws IOException if corpus compilation fails
+         */
+        @Setup(Level.Trial)
+        public void setUp() throws IOException {
+            final BenchmarkCorpusSupport.BenchmarkCorpus corpus = BenchmarkCorpusSupport.createCorpus(this.stemCount);
+            final ReductionSettings settings =
+                    ReductionSettings.withDefaults(ReductionMode.valueOf(this.reductionMode));
+            this.trie = BenchmarkCorpusSupport.compilePatchTrie(corpus.dictionaryText(), settings, true);
+            this.lookupKeys = corpus.lookupKeys();
+            this.ambiguousLookupKeys = corpus.ambiguousLookupKeys();
+        }
+    }
+
+    /**
+     * Measures preferred patch lookup latency; one invocation visits every lookup key.
+     *
+     * @param state prepared lookup state
+     * @param blackhole sink preventing dead-code elimination
+     */
+    @Benchmark
+    public void lookupPreferredPatch(final LookupState state, final Blackhole blackhole) {
+        final String[] keys = state.lookupKeys;
+        for (String key : keys) {
+            final String patch = state.trie.get(key);
+            if (patch == null) {
+                throw new IllegalStateException("Missing preferred patch for key " + key + '.');
+            }
+            blackhole.consume(patch);
+        }
+    }
+
+    /**
+     * Measures retrieval of all patch candidates; one invocation visits every ambiguous key.
+     *
+     * @param state prepared lookup state
+     * @param blackhole sink preventing dead-code elimination
+     */
+    @Benchmark
+    public void lookupAllPatches(final LookupState state, final Blackhole blackhole) {
+        final String[] keys = state.ambiguousLookupKeys;
+        for (String key : keys) {
+            final String[] patches = state.trie.getAll(key);
+            if (patches.length < 2) {
+                throw new IllegalStateException("Expected multiple patches for key " + key + '.');
+            }
+            blackhole.consume(patches);
+        }
+    }
+
+    /**
+     * Measures end-to-end preferred stemming from lookup plus patch application.
+     *
+     * @param state prepared lookup state
+     * @param blackhole sink preventing dead-code elimination
+     */
+    @Benchmark
+    public void stemPreferredVariant(final LookupState state, final Blackhole blackhole) {
+        final String[] keys = state.lookupKeys;
+        for (String key : keys) {
+            final String patch = state.trie.get(key);
+            blackhole.consume(PatchCommandEncoder.apply(key, patch));
+        }
+    }
+
+    /**
+     * Measures end-to-end full candidate stemming from {@code getAll()} plus
+     * patch application.
+     *
+     * @param state prepared lookup state
+     * @param blackhole sink preventing dead-code elimination
+     */
+    @Benchmark
+    public void stemAllVariants(final LookupState state, final Blackhole blackhole) {
+        final String[] keys = state.ambiguousLookupKeys;
+        for (String key : keys) {
+            final String[] patches = state.trie.getAll(key);
+            for (String patch : patches) {
+                blackhole.consume(PatchCommandEncoder.apply(key, patch));
+            }
+        }
+    }
+}
diff --git a/src/jmh/java/org/egothor/stemmer/benchmark/package-info.java b/src/jmh/java/org/egothor/stemmer/benchmark/package-info.java
new file mode 100644
index 0000000..592bf60
--- /dev/null
+++ b/src/jmh/java/org/egothor/stemmer/benchmark/package-info.java
@@ -0,0 +1,10 @@
+/**
+ * JMH benchmarks for the Radixor algorithmic core.
+ *
+ * <p>
+ * The benchmarks in this package focus on trie lookup latency, retrieval of all
+ * candidate patch commands, and end-to-end dictionary compilation with
+ * different reduction modes.
+ * </p>
+ */
+package org.egothor.stemmer.benchmark;