From 85e33f2f60ae3c5cc6dbbb2f5eadb1f5ed74573c Mon Sep 17 00:00:00 2001 From: Leo Galambos Date: Tue, 14 Apr 2026 02:40:30 +0200 Subject: [PATCH] feat: JMH benchmarks added --- .classpath | 13 +- .github/workflows/benchmarks.yml | 51 +++++ .github/workflows/pages.yml | 12 +- build.gradle | 24 ++ .../benchmark/BenchmarkCorpusSupport.java | 208 ++++++++++++++++++ .../FrequencyTrieCompilationBenchmark.java | 94 ++++++++ .../FrequencyTrieLookupBenchmark.java | 160 ++++++++++++++ .../stemmer/benchmark/package-info.java | 10 + 8 files changed, 568 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/benchmarks.yml create mode 100644 src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java create mode 100644 src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieCompilationBenchmark.java create mode 100644 src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieLookupBenchmark.java create mode 100644 src/jmh/java/org/egothor/stemmer/benchmark/package-info.java diff --git a/.classpath b/.classpath index 6c1e625..e3393aa 100644 --- a/.classpath +++ b/.classpath @@ -3,20 +3,27 @@ - + - + - + + + + + + + + diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 0000000..9b51665 --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,51 @@ +name: Benchmarks + +on: + workflow_dispatch: + schedule: + - cron: '0 3 * * 1' + push: + branches: + - main + paths: + - 'src/main/**' + - 'src/jmh/**' + - 'build.gradle' + - 'gradle/**' + - 'gradlew' + - 'gradlew.bat' + - '.github/workflows/benchmarks.yml' + +jobs: + jmh: + runs-on: ubuntu-latest + timeout-minutes: 30 + + permissions: + contents: read + + steps: + - name: Check out sources + uses: actions/checkout@v4 + + - name: Set up JDK 21 + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: '21' + cache: gradle + + - name: Make Gradle executable + run: chmod +x ./gradlew + + - name: Run JMH benchmarks + run: ./gradlew clean 
jmh --no-daemon + + - name: Upload JMH reports + uses: actions/upload-artifact@v4 + with: + name: jmh-reports + path: | + build/reports/jmh/** + build/results/jmh/** + if-no-files-found: warn \ No newline at end of file diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 6daaf0a..02b2931 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -4,6 +4,16 @@ on: push: branches: - main + paths: + - 'src/main/**' + - 'src/test/**' + - 'src/jmh/**' + - 'build.gradle' + - 'settings.gradle' + - 'gradle/**' + - 'gradlew' + - 'gradlew.bat' + - '.github/workflows/pages.yml' workflow_dispatch: permissions: @@ -37,7 +47,7 @@ jobs: uses: gradle/actions/setup-gradle@v4 - name: Build reports for publication - run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport pitest + run: ./gradlew --no-daemon clean build pmdMain javadoc jacocoTestReport pitest jmh - name: Prepare gh-pages worktree shell: bash diff --git a/build.gradle b/build.gradle index 90bb2e7..4b40b17 100644 --- a/build.gradle +++ b/build.gradle @@ -1,15 +1,19 @@ plugins { id 'java' + id 'eclipse' id 'application' id 'pmd' id 'jacoco' id 'info.solidsoft.pitest' version '1.19.0' + id 'me.champeau.jmh' version '0.7.2' id 'com.palantir.git-version' version '4.0.0' } group = 'org.egothor.stemmer' version = gitVersion(prefix:'release@') +def benchmarkReportsDirectory = layout.buildDirectory.dir('reports/jmh') + configurations { mockitoAgent } @@ -34,6 +38,8 @@ repositories { } dependencies { + jmhImplementation sourceSets.main.output + testImplementation platform(libs.junit.bom) testImplementation libs.junit.jupiter testRuntimeOnly libs.junit.platform.launcher @@ -104,6 +110,24 @@ application { mainClass = 'org.egothor.stemmer.Compile' } +jmh { + jmhVersion = '1.37' + warmupIterations = 3 + iterations = 5 + fork = 1 + benchmarkMode = ['avgt'] + timeUnit = 'ns' + resultFormat = 'CSV' + resultsFile = benchmarkReportsDirectory.map { 
it.file('jmh-results.csv').asFile }.get() + humanOutputFile = benchmarkReportsDirectory.map { it.file('jmh-results.txt').asFile }.get() + duplicateClassesStrategy = DuplicatesStrategy.EXCLUDE +} + +tasks.named('jmh') { + group = 'verification' + description = 'Runs JMH benchmarks for the Radixor algorithmic core.' +} + javadoc { failOnError = false diff --git a/src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java b/src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java new file mode 100644 index 0000000..f335861 --- /dev/null +++ b/src/jmh/java/org/egothor/stemmer/benchmark/BenchmarkCorpusSupport.java @@ -0,0 +1,208 @@ +package org.egothor.stemmer.benchmark; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Objects; +import java.util.SplittableRandom; +import org.egothor.stemmer.FrequencyTrie; +import org.egothor.stemmer.PatchCommandEncoder; +import org.egothor.stemmer.ReductionSettings; +import org.egothor.stemmer.StemmerDictionaryParser; + +/** + * Builds deterministic benchmark corpora used by the JMH suite. + * + *
<p>
+ * The generated corpus is intentionally synthetic but morphology-shaped: it + * creates a stable base vocabulary and derives common inflectional and + * derivational variants from each stem. The corpus also injects a controlled + * amount of homograph ambiguity so that {@link FrequencyTrie#getAll(String)} is + * measured on keys that really produce multiple candidate patch commands. + * 
</p>
+ */ +final class BenchmarkCorpusSupport { + + /** + * Prefixes used to synthesize pronounceable stems. + */ + private static final String[] PREFIXES = { + "adapt", "align", "anchor", "answer", "apply", "balance", "build", "capture", "center", + "change", "collect", "connect", "convert", "cover", "create", "cycle", "declare", "define", + "deliver", "derive", "design", "detect", "develop", "drive", "encode", "extend", "filter", + "form", "govern", "handle", "improve", "index", "inform", "inspect", "join", "launch", + "limit", "manage", "map", "model", "move", "observe", "operate", "organ", "pattern", + "perform", "plan", "predict", "prepare", "process", "project", "protect", "publish", "query", + "reduce", "refresh", "render", "repeat", "resolve", "return", "scale", "search", "select", + "shape", "signal", "sort", "state", "store", "stream", "structure", "supply", "support", + "switch", "trace", "transform", "update", "validate", "value" + }; + + /** + * Suffixes used to diversify stems. + */ + private static final String[] STEM_SUFFIXES = { + "", "er", "or", "al", "ive", "ion", "ent", "ant", "ure", "ment", "ist", "ity" + }; + + /** + * Number of neighboring stems sharing one ambiguous surface form. + */ + private static final int HOMOGRAPH_GROUP_SIZE = 4; + + /** + * Utility class. + */ + private BenchmarkCorpusSupport() { + throw new AssertionError("No instances."); + } + + /** + * Creates a deterministic benchmark corpus. 
+ * + * @param stemCount number of canonical stems to generate + * @return immutable benchmark corpus description + */ + static BenchmarkCorpus createCorpus(final int stemCount) { + if (stemCount < 1) { + throw new IllegalArgumentException("stemCount must be at least 1."); + } + + final StringBuilder dictionaryBuilder = new StringBuilder(stemCount * 120); + final LinkedHashSet lookupKeys = new LinkedHashSet<>(stemCount * 8); + final LinkedHashSet ambiguousLookupKeys = new LinkedHashSet<>(Math.max(1, stemCount / 4)); + final SplittableRandom random = new SplittableRandom(20260414L); + + for (int index = 0; index < stemCount; index++) { + final String stem = createStem(index); + final String[] variants = createVariants(stem, random, index); + + dictionaryBuilder.append(stem); + lookupKeys.add(stem); + for (String variant : variants) { + dictionaryBuilder.append(' ').append(variant); + lookupKeys.add(variant); + } + + final String homograph = createHomograph(index); + dictionaryBuilder.append(' ').append(homograph); + lookupKeys.add(homograph); + ambiguousLookupKeys.add(homograph); + + dictionaryBuilder.append('\n'); + } + + return new BenchmarkCorpus( + dictionaryBuilder.toString(), + lookupKeys.toArray(String[]::new), + ambiguousLookupKeys.toArray(String[]::new)); + } + + /** + * Builds a compiled trie from benchmark corpus text. 
+ * + * @param corpusText line-oriented dictionary text + * @param reductionSettings reduction settings + * @param storeOriginalStem whether the canonical stem itself should also be + * inserted with the no-op patch + * @return compiled trie containing patch commands + * @throws IOException if parsing fails + */ + static FrequencyTrie compilePatchTrie( + final String corpusText, + final ReductionSettings reductionSettings, + final boolean storeOriginalStem) throws IOException { + Objects.requireNonNull(corpusText, "corpusText"); + Objects.requireNonNull(reductionSettings, "reductionSettings"); + + final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings); + final PatchCommandEncoder encoder = new PatchCommandEncoder(); + + StemmerDictionaryParser.parse( + new StringReader(corpusText), + "benchmark-corpus", + (stem, variants, lineNumber) -> { + if (storeOriginalStem) { + builder.put(stem, encoder.encode(stem, stem)); + } + for (String variant : variants) { + builder.put(variant, encoder.encode(variant, stem)); + } + }); + + return builder.build(); + } + + /** + * Creates one deterministic stem. + * + * @param index stem ordinal + * @return generated stem + */ + private static String createStem(final int index) { + final String prefix = PREFIXES[index % PREFIXES.length]; + final String suffix = STEM_SUFFIXES[(index / PREFIXES.length) % STEM_SUFFIXES.length]; + return (prefix + suffix + base36(index)).toLowerCase(Locale.ROOT); + } + + /** + * Creates a set of deterministic variants for one stem. 
+ * + * @param stem canonical stem + * @param random deterministic random source + * @param index stem ordinal + * @return generated variants in stable order + */ + private static String[] createVariants(final String stem, final SplittableRandom random, final int index) { + final List variants = new ArrayList<>(8); + variants.add(stem + "s"); + variants.add(stem + "ed"); + variants.add(stem + "ing"); + variants.add(stem + "er"); + variants.add(stem + "ers"); + variants.add("pre" + stem); + variants.add(stem + random.nextInt(10)); + + if ((index & 1) == 0) { + variants.add(stem + "ly"); + } + if (stem.length() > 5) { + variants.add(stem.substring(0, stem.length() - 1)); + } + return variants.toArray(String[]::new); + } + + /** + * Creates an ambiguous surface form shared by a small group of stems. + * + * @param index stem ordinal + * @return shared homograph form + */ + private static String createHomograph(final int index) { + return "shared" + base36(index / HOMOGRAPH_GROUP_SIZE); + } + + /** + * Converts an ordinal into a compact base-36 discriminator. + * + * @param value numeric value + * @return compact discriminator + */ + private static String base36(final int value) { + return Integer.toString(value, Character.MAX_RADIX); + } + + /** + * Immutable benchmark corpus. 
+ * + * @param dictionaryText full line-oriented dictionary text + * @param lookupKeys keys used for general lookup measurements + * @param ambiguousLookupKeys keys that return multiple patch candidates from + * {@code getAll()} + */ + record BenchmarkCorpus(String dictionaryText, String[] lookupKeys, String[] ambiguousLookupKeys) { + } +} diff --git a/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieCompilationBenchmark.java b/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieCompilationBenchmark.java new file mode 100644 index 0000000..d77b461 --- /dev/null +++ b/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieCompilationBenchmark.java @@ -0,0 +1,94 @@ +package org.egothor.stemmer.benchmark; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.egothor.stemmer.ReductionMode; +import org.egothor.stemmer.ReductionSettings; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Benchmarks end-to-end dictionary compilation for different reduction modes. + * + *
<p>
+ * This benchmark measures the offline path that matters for dictionary build + * workflows: dictionary parsing, patch-command generation, mutable trie + * population, subtree reduction, and freezing into the compiled read-only trie. + * 
</p>
+ */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +public class FrequencyTrieCompilationBenchmark { + + /** + * Shared benchmark state for compilation scenarios. + */ + @State(Scope.Benchmark) + public static class CompilationState { + + /** + * Number of canonical stems to generate. + */ + @Param({ "2000", "10000" }) + public int stemCount; + + /** + * Reduction mode used during trie compilation. + */ + @Param({ + "MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS", + "MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS", + "MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS" + }) + public String reductionMode; + + /** + * Whether to store the stem itself using the canonical no-op patch. + */ + @Param({ "true", "false" }) + public boolean storeOriginalStem; + + /** + * Full dictionary text used as the benchmark input. + */ + private String dictionaryText; + + /** + * Initializes the benchmark state. + */ + @Setup(Level.Trial) + public void setUp() { + this.dictionaryText = BenchmarkCorpusSupport.createCorpus(this.stemCount).dictionaryText(); + } + } + + /** + * Measures end-to-end patch trie compilation latency. 
+ * + * @param state prepared compilation state + * @param blackhole sink preventing dead-code elimination + * @throws IOException if dictionary parsing fails + */ + @Benchmark + public void compilePatchTrie(final CompilationState state, final Blackhole blackhole) throws IOException { + final ReductionSettings settings = + ReductionSettings.withDefaults(ReductionMode.valueOf(state.reductionMode)); + blackhole.consume( + BenchmarkCorpusSupport.compilePatchTrie( + state.dictionaryText, + settings, + state.storeOriginalStem)); + } +} diff --git a/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieLookupBenchmark.java b/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieLookupBenchmark.java new file mode 100644 index 0000000..3325af3 --- /dev/null +++ b/src/jmh/java/org/egothor/stemmer/benchmark/FrequencyTrieLookupBenchmark.java @@ -0,0 +1,160 @@ +package org.egothor.stemmer.benchmark; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.egothor.stemmer.FrequencyTrie; +import org.egothor.stemmer.PatchCommandEncoder; +import org.egothor.stemmer.ReductionMode; +import org.egothor.stemmer.ReductionSettings; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Benchmarks lookup-oriented operations on compiled Radixor tries. + * + *
<p>
+ * The benchmark uses a deterministic morphology-shaped corpus and measures the + * latency of the hot-path lookup operations that are relevant at runtime: + * retrieving the preferred patch command, retrieving all candidate patch + * commands, and reconstructing stems from the returned patch values. + * 
</p>
+ */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 1) +public class FrequencyTrieLookupBenchmark { + + /** + * Shared benchmark state for lookup scenarios. + */ + @State(Scope.Benchmark) + public static class LookupState { + + /** + * Number of canonical stems to generate. + */ + @Param({ "2000", "10000" }) + public int stemCount; + + /** + * Reduction mode used to compile the lookup trie. + */ + @Param({ + "MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS", + "MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS", + "MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS" + }) + public String reductionMode; + + /** + * Compiled trie under test. + */ + private FrequencyTrie trie; + + /** + * Deterministic lookup keys. + */ + private String[] lookupKeys; + + /** + * Keys that are known to return multiple patch candidates from + * {@code getAll()}. + */ + private String[] ambiguousLookupKeys; + + /** + * Initializes the benchmark state. + * + * @throws IOException if corpus compilation fails + */ + @Setup(Level.Trial) + public void setUp() throws IOException { + final BenchmarkCorpusSupport.BenchmarkCorpus corpus = BenchmarkCorpusSupport.createCorpus(this.stemCount); + final ReductionSettings settings = + ReductionSettings.withDefaults(ReductionMode.valueOf(this.reductionMode)); + this.trie = BenchmarkCorpusSupport.compilePatchTrie(corpus.dictionaryText(), settings, true); + this.lookupKeys = corpus.lookupKeys(); + this.ambiguousLookupKeys = corpus.ambiguousLookupKeys(); + } + } + + /** + * Measures preferred patch lookup latency. 
+ * + * @param state prepared lookup state + * @param blackhole sink preventing dead-code elimination + */ + @Benchmark + public void lookupPreferredPatch(final LookupState state, final Blackhole blackhole) { + final String[] keys = state.lookupKeys; + for (String key : keys) { + final String patch = state.trie.get(key); + if (patch == null) { + throw new IllegalStateException("Missing preferred patch for key " + key + '.'); + } + blackhole.consume(patch); + } + } + + /** + * Measures retrieval of all patch candidates on ambiguous forms. + * + * @param state prepared lookup state + * @param blackhole sink preventing dead-code elimination + */ + @Benchmark + public void lookupAllPatches(final LookupState state, final Blackhole blackhole) { + final String[] keys = state.ambiguousLookupKeys; + for (String key : keys) { + final String[] patches = state.trie.getAll(key); + if (patches.length < 2) { + throw new IllegalStateException("Expected multiple patches for key " + key + '.'); + } + blackhole.consume(patches); + } + } + + /** + * Measures end-to-end preferred stemming from lookup plus patch application. + * + * @param state prepared lookup state + * @param blackhole sink preventing dead-code elimination + */ + @Benchmark + public void stemPreferredVariant(final LookupState state, final Blackhole blackhole) { + final String[] keys = state.lookupKeys; + for (String key : keys) { + final String patch = state.trie.get(key); + blackhole.consume(PatchCommandEncoder.apply(key, patch)); + } + } + + /** + * Measures end-to-end full candidate stemming from {@code getAll()} plus + * patch application. 
+ * + * @param state prepared lookup state + * @param blackhole sink preventing dead-code elimination + */ + @Benchmark + public void stemAllVariants(final LookupState state, final Blackhole blackhole) { + final String[] keys = state.ambiguousLookupKeys; + for (String key : keys) { + final String[] patches = state.trie.getAll(key); + for (String patch : patches) { + blackhole.consume(PatchCommandEncoder.apply(key, patch)); + } + } + } +} diff --git a/src/jmh/java/org/egothor/stemmer/benchmark/package-info.java b/src/jmh/java/org/egothor/stemmer/benchmark/package-info.java new file mode 100644 index 0000000..592bf60 --- /dev/null +++ b/src/jmh/java/org/egothor/stemmer/benchmark/package-info.java @@ -0,0 +1,10 @@ +/** + * JMH benchmarks for the Radixor algorithmic core. + * + *
<p>
+ * The benchmarks in this package focus on trie lookup latency, retrieval of all + * candidate patch commands, and end-to-end dictionary compilation with + * different reduction modes. + * 
</p>
+ */ +package org.egothor.stemmer.benchmark;