diff --git a/gradle.lockfile b/gradle.lockfile index 2399250..0b70d2e 100644 --- a/gradle.lockfile +++ b/gradle.lockfile @@ -7,11 +7,11 @@ com.google.code.gson:gson:2.13.2=pmd com.google.errorprone:error_prone_annotations:2.41.0=pmd net.bytebuddy:byte-buddy-agent:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath net.bytebuddy:byte-buddy:1.17.7=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath -net.jqwik:jqwik-api:1.9.3=testCompileClasspath,testRuntimeClasspath -net.jqwik:jqwik-engine:1.9.3=testRuntimeClasspath -net.jqwik:jqwik-time:1.9.3=testCompileClasspath,testRuntimeClasspath -net.jqwik:jqwik-web:1.9.3=testCompileClasspath,testRuntimeClasspath -net.jqwik:jqwik:1.9.3=testCompileClasspath,testRuntimeClasspath +net.jqwik:jqwik-api:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath +net.jqwik:jqwik-engine:1.9.3=jmhRuntimeClasspath,testRuntimeClasspath +net.jqwik:jqwik-time:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath +net.jqwik:jqwik-web:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath +net.jqwik:jqwik:1.9.3=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath net.sf.jopt-simple:jopt-simple:4.9=pitest net.sf.jopt-simple:jopt-simple:5.0.4=jmh,jmhCompileClasspath,jmhRuntimeClasspath net.sf.saxon:Saxon-HE:12.9=pmd @@ -24,7 +24,7 @@ org.apache.commons:commons-lang3:3.18.0=pitest org.apache.commons:commons-lang3:3.20.0=pmd org.apache.commons:commons-math3:3.6.1=jmh,jmhCompileClasspath,jmhRuntimeClasspath org.apache.commons:commons-text:1.14.0=pitest -org.apiguardian:apiguardian-api:1.1.2=testCompileClasspath,testRuntimeClasspath +org.apiguardian:apiguardian-api:1.1.2=jmhRuntimeClasspath,testCompileClasspath,testRuntimeClasspath org.checkerframework:checker-qual:3.52.1=pmd org.jacoco:org.jacoco.agent:0.8.14=jacocoAgent,jacocoAnt org.jacoco:org.jacoco.ant:0.8.14=jacocoAnt diff --git a/src/main/java/org/egothor/stemmer/FrequencyTrie.java b/src/main/java/org/egothor/stemmer/FrequencyTrie.java index 03d9d40..2adfb14 100644 --- a/src/main/java/org/egothor/stemmer/FrequencyTrie.java +++ b/src/main/java/org/egothor/stemmer/FrequencyTrie.java @@ -426,6 +426,8 @@ public final class FrequencyTrie { childNodeIds[edgeIndex] = dataInput.readInt(); } + validateSerializedEdges(nodeIndex, edgeLabels); + final int valueCount = dataInput.readInt(); if (valueCount < 0) { throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount); @@ -474,6 +476,28 @@ public final class FrequencyTrie { return nodes; } + /** + * Validates the serialized edge-label sequence for one node. + * + *

+ * Compiled nodes rely on binary search for child lookup and therefore require + * edge labels to be stored in strict ascending order without duplicates. + * Rejecting malformed streams here keeps lookup semantics deterministic and + * avoids silently constructing a trie whose search behavior would be undefined. + * + * @param nodeIndex serialized node identifier + * @param edgeLabels serialized edge labels + * @throws IOException if the edge labels are not strictly ascending + */ + private static void validateSerializedEdges(final int nodeIndex, final char... edgeLabels) throws IOException { + for (int edgeIndex = 1; edgeIndex < edgeLabels.length; edgeIndex++) { + if (edgeLabels[edgeIndex - 1] >= edgeLabels[edgeIndex]) { + throw new IOException("Edge labels must be strictly ascending at node " + nodeIndex + ", edge index " + + edgeIndex + ": '" + edgeLabels[edgeIndex - 1] + "' then '" + edgeLabels[edgeIndex] + "'."); + } + } + } + /** * Locates the compiled node for the supplied key. * diff --git a/src/main/java/org/egothor/stemmer/PatchCommandEncoder.java b/src/main/java/org/egothor/stemmer/PatchCommandEncoder.java index a91aa99..c28143c 100644 --- a/src/main/java/org/egothor/stemmer/PatchCommandEncoder.java +++ b/src/main/java/org/egothor/stemmer/PatchCommandEncoder.java @@ -117,7 +117,14 @@ public final class PatchCommandEncoder { private static final int MISMATCH_PENALTY = 100; /** - * Extra headroom added when internal matrices need to grow. + * Extra matrix headroom reserved beyond the immediately required dimensions. + * + *

+ * A small fixed margin reduces repeated reallocation when a caller encodes many + * similarly sized terms in sequence. The value is intentionally modest: large + * enough to absorb minor size fluctuations, yet small enough to avoid + * materially over-allocating the reused dynamic-programming matrices. + *

*/ private static final int CAPACITY_MARGIN = 8; @@ -288,6 +295,7 @@ public final class PatchCommandEncoder { * @param patchCommand compact patch command * @return transformed word, or {@code null} when {@code source} is {@code null} */ + @SuppressWarnings({ "PMD.CyclomaticComplexity", "PMD.AvoidLiteralsInIfCondition" }) public static String apply(String source, String patchCommand) { if (source == null) { return null; @@ -299,6 +307,10 @@ public final class PatchCommandEncoder { return source; } + if ((patchCommand.length() & 1) != 0) { + return source; + } + StringBuilder result = new StringBuilder(source); if (result.isEmpty()) { @@ -312,11 +324,14 @@ public final class PatchCommandEncoder { char opcode = patchCommand.charAt(patchIndex); char argument = patchCommand.charAt(patchIndex + 1); - int encodedCount = argument - 'a' + 1; switch (opcode) { case SKIP_OPCODE: - position = position - encodedCount + 1; + final int skipCount = decodeEncodedCount(argument); + if (skipCount < 1) { + return source; + } + position = position - skipCount + 1; break; case REPLACE_OPCODE: @@ -324,8 +339,12 @@ public final class PatchCommandEncoder { break; case DELETE_OPCODE: + final int deleteCount = decodeEncodedCount(argument); + if (deleteCount < 1) { + return source; + } int deleteEndExclusive = position + 1; - position -= encodedCount - 1; + position -= deleteCount - 1; result.delete(position, deleteEndExclusive); break; @@ -353,6 +372,26 @@ public final class PatchCommandEncoder { return result.toString(); } + /** + * Decodes a compact count argument used by skip and delete instructions. + * + *

+ * Valid encoded counts start at {@code 'a'} for one affected character. Values + * below {@code 'a'} are malformed and are reported to callers via the + * compatibility fallback path rather than by throwing a dedicated exception. + *

+ * + * @param argument serialized count argument + * @return decoded positive count, or {@code -1} when the argument is malformed + */ + @SuppressWarnings("PMD.AvoidLiteralsInIfCondition") + private static int decodeEncodedCount(final char argument) { + if (argument < 'a') { + return -1; + } + return argument - 'a' + 1; + } + /** * Applies a patch command to an empty source word. * diff --git a/src/main/java/org/egothor/stemmer/trie/CompiledNode.java b/src/main/java/org/egothor/stemmer/trie/CompiledNode.java index 6612cd8..eb5d567 100644 --- a/src/main/java/org/egothor/stemmer/trie/CompiledNode.java +++ b/src/main/java/org/egothor/stemmer/trie/CompiledNode.java @@ -31,6 +31,7 @@ package org.egothor.stemmer.trie; import java.util.Arrays; +import java.util.Objects; /** * Immutable compiled trie node optimized for read access. @@ -38,7 +39,9 @@ import java.util.Arrays; *

* The returned arrays are the internal backing storage of the compiled node. * They are exposed for efficient access by closely related trie infrastructure - * and therefore must never be modified by callers. + * and therefore must never be modified by callers. The node itself is still + * immutable from the public API perspective because construction wires these + * arrays once and all lookup operations thereafter treat them as read-only. * * @param value type * @param edgeLabels internal edge label array @@ -46,8 +49,90 @@ import java.util.Arrays; * @param orderedValues internal ordered values array * @param orderedCounts internal ordered counts array */ +@SuppressWarnings("PMD.DataClass") public record CompiledNode(char[] edgeLabels, CompiledNode[] children, V[] orderedValues, int... orderedCounts) { + /** + * Creates one validated compiled node. + * + * @throws NullPointerException if any array argument is {@code null} + * @throws IllegalArgumentException if the edge-related arrays or value-related + * arrays do not have matching lengths + */ + public CompiledNode { + Objects.requireNonNull(edgeLabels, "edgeLabels"); + Objects.requireNonNull(children, "children"); + Objects.requireNonNull(orderedValues, "orderedValues"); + Objects.requireNonNull(orderedCounts, "orderedCounts"); + + if (edgeLabels.length != children.length) { + throw new IllegalArgumentException("edgeLabels and children must have the same length."); + } + if (orderedValues.length != orderedCounts.length) { + throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length."); + } + } + + /** + * Returns the internal edge-label array. + * + *

+ * The returned array is not copied for performance reasons and must be treated + * as read-only. + * + * @return internal edge-label array + */ + @Override + @SuppressWarnings("PMD.MethodReturnsInternalArray") + public char[] edgeLabels() { + return this.edgeLabels; + } + + /** + * Returns the internal child-node array. + * + *

+ * The returned array is not copied for performance reasons and must be treated + * as read-only by external callers. + * + * @return internal child-node array + */ + @Override + @SuppressWarnings("PMD.MethodReturnsInternalArray") + public CompiledNode[] children() { + return this.children; + } + + /** + * Returns the internal ordered-values array. + * + *

+ * The returned array is not copied for performance reasons and must be treated + * as read-only. + * + * @return internal ordered-values array + */ + @Override + @SuppressWarnings("PMD.MethodReturnsInternalArray") + public V[] orderedValues() { + return this.orderedValues; + } + + /** + * Returns the internal ordered-counts array. + * + *

+ * The returned array is not copied for performance reasons and must be treated + * as read-only. + * + * @return internal ordered-counts array + */ + @Override + @SuppressWarnings("PMD.MethodReturnsInternalArray") + public int[] orderedCounts() { + return this.orderedCounts; + } + /** * Finds a child for the supplied edge character. * diff --git a/src/main/java/org/egothor/stemmer/trie/NodeData.java b/src/main/java/org/egothor/stemmer/trie/NodeData.java index 6a8aa76..03fc601 100644 --- a/src/main/java/org/egothor/stemmer/trie/NodeData.java +++ b/src/main/java/org/egothor/stemmer/trie/NodeData.java @@ -30,14 +30,18 @@ ******************************************************************************/ package org.egothor.stemmer.trie; +import java.util.Objects; + /** * Intermediate node data used during deserialization before child references * are resolved. * *

* The arrays exposed by the accessors are the internal backing storage of this - * holder. They are returned directly for efficiency and therefore must be - * treated as read-only by callers. + * holder. They are returned directly for efficiency because the deserialization + * pipeline copies references into immutable compiled nodes immediately after + * the record is created. Callers must therefore treat every returned array as + * read-only. * * @param value type * @param edgeLabels edge labels @@ -45,6 +49,87 @@ package org.egothor.stemmer.trie; * @param orderedValues ordered values * @param orderedCounts ordered counts */ +@SuppressWarnings("PMD.DataClass") public record NodeData(char[] edgeLabels, int[] childNodeIds, V[] orderedValues, int... orderedCounts) { + /** + * Creates one validated node-data holder. + * + * @throws NullPointerException if any array argument is {@code null} + * @throws IllegalArgumentException if the edge-related arrays or value-related + * arrays do not have matching lengths + */ + public NodeData { + Objects.requireNonNull(edgeLabels, "edgeLabels"); + Objects.requireNonNull(childNodeIds, "childNodeIds"); + Objects.requireNonNull(orderedValues, "orderedValues"); + Objects.requireNonNull(orderedCounts, "orderedCounts"); + + if (edgeLabels.length != childNodeIds.length) { + throw new IllegalArgumentException("edgeLabels and childNodeIds must have the same length."); + } + if (orderedValues.length != orderedCounts.length) { + throw new IllegalArgumentException("orderedValues and orderedCounts must have the same length."); + } + } + + /** + * Returns the internal edge-label array. + * + *

+ * The returned array is not copied for performance reasons and must be treated + * as read-only. + * + * @return internal edge-label array + */ + @Override + @SuppressWarnings("PMD.MethodReturnsInternalArray") + public char[] edgeLabels() { + return this.edgeLabels; + } + + /** + * Returns the internal child-node identifier array. + * + *

+ * The returned array is not copied for performance reasons and must be treated + * as read-only. + * + * @return internal child-node identifier array + */ + @Override + @SuppressWarnings("PMD.MethodReturnsInternalArray") + public int[] childNodeIds() { + return this.childNodeIds; + } + + /** + * Returns the internal ordered-values array. + * + *

+ * The returned array is not copied for performance reasons and must be treated + * as read-only. + * + * @return internal ordered-values array + */ + @Override + @SuppressWarnings("PMD.MethodReturnsInternalArray") + public V[] orderedValues() { + return this.orderedValues; + } + + /** + * Returns the internal ordered-counts array. + * + *

+ * The returned array is not copied for performance reasons and must be treated + * as read-only. + * + * @return internal ordered-counts array + */ + @Override + @SuppressWarnings("PMD.MethodReturnsInternalArray") + public int[] orderedCounts() { + return this.orderedCounts; + } } diff --git a/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java b/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java index 8e9ba59..c37321d 100644 --- a/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java +++ b/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java @@ -733,6 +733,30 @@ class FrequencyTrieTest { assertTrue(exception.getMessage().contains("Invalid root node id")); } + /** + * Verifies that deserialization rejects unsorted or duplicate serialized edge + * labels because compiled lookup relies on binary search over a strictly + * ascending edge array. + */ + @Test + @Tag("persistence") + @DisplayName("readFrom rejects non-ascending serialized edge labels") + void readFromRejectsNonAscendingSerializedEdgeLabels() { + final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> { + dataOutput.writeInt(2); + dataOutput.writeChar('b'); + dataOutput.writeInt(0); + dataOutput.writeChar('a'); + dataOutput.writeInt(0); + dataOutput.writeInt(0); + } }); + + final IOException exception = assertThrows(IOException.class, + () -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC)); + + assertTrue(exception.getMessage().contains("Edge labels must be strictly ascending")); + } + /** * Verifies that deserialization rejects non-positive stored counts. */ diff --git a/src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java b/src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java index e0ab19c..cff44eb 100644 --- a/src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java +++ b/src/test/java/org/egothor/stemmer/PatchCommandEncoderTest.java @@ -174,7 +174,13 @@ class PatchCommandEncoderTest { // 9 Arguments.of(9, "", "-a"), // 10 - Arguments.of(10, "", "Ra")); + Arguments.of(10, "", "Ra"), + // 11 + Arguments.of(11, "abc", "D`"), + // 12 + Arguments.of(12, "abc", "-`"), + // 13 + Arguments.of(13, "", "D`")); } /** diff --git a/src/test/java/org/egothor/stemmer/trie/CompiledNodeAndNodeDataTest.java b/src/test/java/org/egothor/stemmer/trie/CompiledNodeAndNodeDataTest.java new file mode 100644 index 0000000..c33f5b8 --- /dev/null +++ b/src/test/java/org/egothor/stemmer/trie/CompiledNodeAndNodeDataTest.java @@ -0,0 +1,148 @@ +/******************************************************************************* + * Copyright (C) 2026, Leo Galambos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software must + * display the following acknowledgement: + * This product includes software developed by the Egothor project. + * + * 4. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************/ +package org.egothor.stemmer.trie; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link CompiledNode} and {@link NodeData} validation and + * documented backing-array exposure. + */ +@Tag("unit") +@Tag("fast") +@Tag("trie") +@DisplayName("CompiledNode and NodeData") +class CompiledNodeAndNodeDataTest { + + /** + * Verifies that {@link NodeData} rejects mismatched edge-related array lengths. + */ + @Test + @DisplayName("NodeData rejects mismatched edge arrays") + void nodeDataShouldRejectMismatchedEdgeArrays() { + final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, + () -> new NodeData(new char[] { 'a' }, new int[0], new String[0], new int[0])); + + assertEquals("edgeLabels and childNodeIds must have the same length.", exception.getMessage()); + } + + /** + * Verifies that {@link NodeData} rejects mismatched value-related array + * lengths. + */ + @Test + @DisplayName("NodeData rejects mismatched value arrays") + void nodeDataShouldRejectMismatchedValueArrays() { + final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, + () -> new NodeData(new char[0], new int[0], new String[] { "stem" }, new int[0])); + + assertEquals("orderedValues and orderedCounts must have the same length.", exception.getMessage()); + } + + /** + * Verifies that {@link NodeData} continues to expose the documented backing + * arrays directly. + */ + @Test + @DisplayName("NodeData accessors expose documented backing arrays") + void nodeDataAccessorsShouldExposeDocumentedBackingArrays() { + final char[] edgeLabels = new char[] { 'a' }; + final int[] childNodeIds = new int[] { 7 }; + final String[] orderedValues = new String[] { "stem" }; + final int[] orderedCounts = new int[] { 3 }; + final NodeData nodeData = new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts); + + assertSame(edgeLabels, nodeData.edgeLabels()); + assertSame(childNodeIds, nodeData.childNodeIds()); + assertSame(orderedValues, nodeData.orderedValues()); + assertSame(orderedCounts, nodeData.orderedCounts()); + } + + /** + * Verifies that {@link CompiledNode} rejects mismatched edge and child arrays. + */ + @Test + @DisplayName("CompiledNode rejects mismatched edge and child arrays") + void compiledNodeShouldRejectMismatchedEdgeAndChildArrays() { + @SuppressWarnings("unchecked") + final CompiledNode[] children = new CompiledNode[0]; + + final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, + () -> new CompiledNode(new char[] { 'a' }, children, new String[0], new int[0])); + + assertEquals("edgeLabels and children must have the same length.", exception.getMessage()); + } + + /** + * Verifies that {@link CompiledNode} rejects mismatched value arrays. + */ + @Test + @DisplayName("CompiledNode rejects mismatched value arrays") + void compiledNodeShouldRejectMismatchedValueArrays() { + @SuppressWarnings("unchecked") + final CompiledNode[] children = new CompiledNode[0]; + + final IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, + () -> new CompiledNode(new char[0], children, new String[] { "stem" }, new int[0])); + + assertEquals("orderedValues and orderedCounts must have the same length.", exception.getMessage()); + } + + /** + * Verifies that {@link CompiledNode} continues to expose the documented backing + * arrays directly. + */ + @Test + @DisplayName("CompiledNode accessors expose documented backing arrays") + void compiledNodeAccessorsShouldExposeDocumentedBackingArrays() { + final char[] edgeLabels = new char[] { 'a' }; + @SuppressWarnings("unchecked") + final CompiledNode[] children = new CompiledNode[1]; + final String[] orderedValues = new String[] { "stem" }; + final int[] orderedCounts = new int[] { 5 }; + final CompiledNode node = new CompiledNode<>(edgeLabels, children, orderedValues, orderedCounts); + + assertSame(edgeLabels, node.edgeLabels()); + assertSame(children, node.children()); + assertSame(orderedValues, node.orderedValues()); + assertSame(orderedCounts, node.orderedCounts()); + } +}