test to see if ai-generated fuzzy search is better

2025-12-29 23:14:52 +01:00
parent adfb3a9fb2
commit 5b765f9a7d
2 changed files with 100 additions and 34 deletions
--- a/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java
+++ b/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java
@@ -65,59 +65,104 @@ public class BonusManager {

    public static int fuzzyMatchScore(String query, String title) {
        if (query == null || title == null || query.isEmpty() || title.isEmpty()) {
-            return query == null ? (title == null ? 100 : 0) : (title.isEmpty() ? 100 : 0);
+            return 0;
        }

-        // Normalize both strings: remove diacritics, lowercase, remove special chars
        String normalizedQuery = normalize(query);
        String normalizedTitle = normalize(title);

-        // Exact match after normalization
-        if (normalizedTitle.equals(normalizedQuery)) {
+        // ===== TIER 1: EXACT WORD MATCH (highest priority) =====
+        if (isExactWordMatch(normalizedTitle, normalizedQuery)) {
            return 100;
        }

-        // Substring match (query is contained in title)
-        if (normalizedTitle.contains(normalizedQuery)) {
-            return 95; // Very high score but slightly less than exact
+        // ===== TIER 2: WORD-BOUNDARY SUBSTRING =====
+        if (isWordBoundaryMatch(normalizedTitle, normalizedQuery)) {
+            return 95;
        }

+        // ===== TIER 3: PREFIX MATCH =====
+        if (isPrefixMatch(normalizedTitle, normalizedQuery)) {
+            return 85;
+        }
+
+        // ===== TIER 4: LEVENSHTEIN (typo tolerance) =====
        int qlen = normalizedQuery.length();
        int tlen = normalizedTitle.length();

-        // Query longer than title - impossible match
        if (qlen > tlen) {
            return 0;
        }

-        // Find the best matching substring using Levenshtein distance
        int bestDistance = Integer.MAX_VALUE;
-        int bestPosition = -1;

        for (int i = 0; i <= tlen - qlen; i++) {
            String sub = normalizedTitle.substring(i, i + qlen);
-            int dist = LevenshteinDistance.calculate(normalizedQuery, sub);
-
+            int dist = levenshteinDistance(normalizedQuery, sub);
            if (dist < bestDistance) {
                bestDistance = dist;
-                bestPosition = i;
-
-                if (dist == 0) break; // Perfect match found, can't do better
+                if (dist == 0) break;
            }
        }

-        // Calculate score: 100% at distance 0, scales down with distance
-        // Normalize by query length for consistency
-        double similarity = 1.0 - (bestDistance / (double) qlen);
-
-        // Apply position bonus: matches at the start are better
-        if (bestPosition == 0) {
-            similarity *= 1.1; // 10% boost for start matches
+        // Allow up to 2 edits (typo tolerance)
+        if (bestDistance <= 2) {
+            // Distance 0 = 80, Distance 1 = 70, Distance 2 = 60
+            int score = 80 - (bestDistance * 10);
+            return Math.max(0, score);
        }

-        // Clamp to 0-100
-        int score = (int) (similarity * 100.0);
-        return Math.max(0, Math.min(100, score));
+        return 0;
+    }
+
+    /**
+     * Exact word match: query must equal one whole whitespace-separated word of the title
+     * "LOR" matches "L'OR" (normalized to "lor") or "LOR coffee" but NOT "LOREAL"
+     */
+    private static boolean isExactWordMatch(String title, String query) {
+        String[] words = title.split("\\s+");
+        for (String word : words) {
+            if (word.equals(query)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
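+    /**
+     * Word boundary match: query opens the title (followed by a space) or directly follows a space
+     * "LOR" matches in "L'OR coffee" (normalized to "lor coffee"); "REAL" does not match in "LOREAL"
+     * and is left to Tier 4. NOTE(review): "LOR" also matches "big LOREAL" - the end is unchecked.
+     */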
+    private static boolean isWordBoundaryMatch(String title, String query) {
+        // Check if query appears after space or at start
+        if (title.startsWith(query + " ")) {
+            return true;
+        }
+
+        if (title.contains(" " + query)) {
+            return true;
+        }
+
+        // Check if query ends at word boundary
+        if (title.endsWith(" " + query)) {
+            return true;
+        }
+
+        return false;
+    }
+
+    /**
+     * Prefix match: query is the start of any word that is longer than the query itself
+     * "CAF" matches in "CAFFE" or "CAFE LATTE"
+     */
+    private static boolean isPrefixMatch(String title, String query) {
+        for (String word : title.split("\\s+")) {
+            if (word.startsWith(query) && word.length() > query.length()) {
+                return true;
+            }
+        }
+        return false;
    }

    private static String normalize(String input) {
@@ -125,14 +170,12 @@ public class BonusManager {
            return input;
        }

-        // Unicode decomposition: separate base chars from diacritics
+        // Remove diacritics
        String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
-
-        // Remove all combining diacritical marks
        String withoutDiacritics = decomposed
                .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");

-        // Lowercase and remove special characters (keep alphanumeric + spaces)
+        // Lowercase, remove special chars, normalize spaces
        String cleaned = withoutDiacritics
                .toLowerCase()
                .replaceAll("[^a-z0-9\\s]", "")
@@ -141,4 +184,32 @@ public class BonusManager {

        return cleaned;
    }
+
+    private static int levenshteinDistance(String query, String title) {
+        int qlen = query.length();
+        int tlen = title.length();
+
+        if (qlen == 0) return tlen;
+        if (tlen == 0) return qlen;
+
+        int[][] dp = new int[qlen + 1][tlen + 1];
+
+        for (int i = 0; i <= qlen; i++) dp[i][0] = i;
+        for (int j = 0; j <= tlen; j++) dp[0][j] = j;
+
+        for (int i = 1; i <= qlen; i++) {
+            for (int j = 1; j <= tlen; j++) {
+                if (query.charAt(i - 1) == title.charAt(j - 1)) {
+                    dp[i][j] = dp[i - 1][j - 1];
+                } else {
+                    dp[i][j] = 1 + Math.min(
+                            Math.min(dp[i - 1][j - 1], dp[i - 1][j]),
+                            dp[i][j - 1]
+                    );
+                }
+            }
+        }
+
+        return dp[qlen][tlen];
+    }
 }