test to see if ai-generated fuzzy search is better

2025-12-29 23:14:52 +01:00
parent adfb3a9fb2
commit 5b765f9a7d
2 changed files with 100 additions and 34 deletions
--- a/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java
+++ b/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java
@@ -65,59 +65,104 @@ public class BonusManager {
    public static int fuzzyMatchScore(String query, String title) {
        if (query == null || title == null || query.isEmpty() || title.isEmpty()) {
-            return query == null ? (title == null ? 100 : 0) : (title.isEmpty() ? 100 : 0);
+            return 0;
        }
        // Normalize both strings: remove diacritics, lowercase, remove special chars
        String normalizedQuery = normalize(query);
        String normalizedTitle = normalize(title);
-        // Exact match after normalization
+        // ===== TIER 1: EXACT WORD MATCH (highest priority) =====
-        if (normalizedTitle.equals(normalizedQuery)) {
+        if (isExactWordMatch(normalizedTitle, normalizedQuery)) {
            return 100;
        }
-        // Substring match (query is contained in title)
+        // ===== TIER 2: WORD-BOUNDARY SUBSTRING =====
-        if (normalizedTitle.contains(normalizedQuery)) {
+        if (isWordBoundaryMatch(normalizedTitle, normalizedQuery)) {
-            return 95; // Very high score but slightly less than exact
+            return 95;
        }
        // ===== TIER 3: PREFIX MATCH =====
        if (isPrefixMatch(normalizedTitle, normalizedQuery)) {
            return 85;
        }
        // ===== TIER 4: LEVENSHTEIN (typo tolerance) =====
        int qlen = normalizedQuery.length();
        int tlen = normalizedTitle.length();
        // Query longer than title - impossible match
        if (qlen > tlen) {
            return 0;
        }
        // Find the best matching substring using Levenshtein distance
        int bestDistance = Integer.MAX_VALUE;
        int bestPosition = -1;
        for (int i = 0; i <= tlen - qlen; i++) {
            String sub = normalizedTitle.substring(i, i + qlen);
-            int dist = LevenshteinDistance.calculate(normalizedQuery, sub);
+            int dist = levenshteinDistance(normalizedQuery, sub);
            if (dist < bestDistance) {
                bestDistance = dist;
-                bestPosition = i;
+                if (dist == 0) break;
                if (dist == 0) break; // Perfect match found, can't do better
            }
        }
-        // Calculate score: 100% at distance 0, scales down with distance
+        // Allow up to 2 edits (typo tolerance)
-        // Normalize by query length for consistency
+        if (bestDistance <= 2) {
-        double similarity = 1.0 - (bestDistance / (double) qlen);
+            // Distance 0 = 80, Distance 1 = 70, Distance 2 = 60
-
+            int score = 80 - (bestDistance * 10);
-        // Apply position bonus: matches at the start are better
+            return Math.max(0, score);
        if (bestPosition == 0) {
            similarity *= 1.1; // 10% boost for start matches
        }
-        // Clamp to 0-100
+        return 0;
-        int score = (int) (similarity * 100.0);
+    }
-        return Math.max(0, Math.min(100, score));
+
    /**
     * Tier 1 check: reports whether {@code query} is identical to one of the
     * whitespace-separated words of {@code title}.
     * <p>
     * Words are compared with {@link String#equals}, so both arguments must already
     * be in the same normalized form. For example "lor" matches "lor coffee", but
     * neither "loreal" (a longer word) nor "l or" (two separate words).
     * <p>
     * An empty query never matches. Splitting an empty title, or a title with leading
     * whitespace, produces an empty first token; without the guard that token would
     * equal an empty query and be reported as a perfect match.
     *
     * @param title text to search in; must not be {@code null}
     * @param query single word to look for
     * @return {@code true} if some word of {@code title} equals {@code query}
     */
    private static boolean isExactWordMatch(String title, String query) {
        if (query == null || query.isEmpty()) {
            return false;
        }
        for (String word : title.split("\\s+")) {
            if (word.equals(query)) {
                return true;
            }
        }
        return false;
    }
    /**
     * Tier 2 check: reports whether {@code query} occurs in {@code title} as a
     * complete word or phrase, i.e. delimited on BOTH sides by a space or by the
     * start/end of the title.
     * <p>
     * Examples: "lor" matches "lor coffee", "douwe lor" and "a lor b"; "coca cola"
     * matches "coca cola zero" and "coca cola". "lor" does not match "a lorem" and
     * "real" does not match "loreal": those are partial-word hits and are left to
     * the later tiers.
     * <p>
     * Both arguments are expected to be normalized already, with single spaces
     * between words. An empty query never matches.
     *
     * @param title text to search in
     * @param query word or phrase to look for
     * @return {@code true} if {@code query} appears in {@code title} on word boundaries
     */
    private static boolean isWordBoundaryMatch(String title, String query) {
        if (query.isEmpty()) {
            return false;
        }
        // Padding both strings with a space folds the four cases (whole title,
        // at the start, at the end, in the middle) into a single containment test
        // and guarantees a boundary on each side of the hit.
        String paddedTitle = " " + title + " ";
        String paddedQuery = " " + query + " ";
        return paddedTitle.contains(paddedQuery);
    }
    /**
     * Tier 3 check: reports whether {@code query} is a proper prefix of any
     * whitespace-separated word of {@code title}.
     * <p>
     * "Proper" means the word must be strictly longer than the query, so a word that
     * equals the query does not count here. Examples: "caf" matches "caffe" and
     * "cafe latte"; "lat" matches "cafe latte"; "caf" does not match "caf".
     * <p>
     * An empty query never matches. The empty string is a prefix of every word, so
     * without the guard every non-empty title would be reported as a prefix match.
     *
     * @param title text to search in; must not be {@code null}
     * @param query prefix to look for; must not be {@code null}
     * @return {@code true} if some word of {@code title} starts with, but is longer
     *         than, {@code query}
     */
    private static boolean isPrefixMatch(String title, String query) {
        if (query.isEmpty()) {
            return false;
        }
        for (String word : title.split("\\s+")) {
            if (word.length() > query.length() && word.startsWith(query)) {
                return true;
            }
        }
        return false;
    }
    private static String normalize(String input) {
@@ -125,14 +170,12 @@ public class BonusManager {
            return input;
        }
-        // Unicode decomposition: separate base chars from diacritics
+        // Remove diacritics
        String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
        // Remove all combining diacritical marks
        String withoutDiacritics = decomposed
                .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
-        // Lowercase and remove special characters (keep alphanumeric + spaces)
+        // Lowercase, remove special chars, normalize spaces
        String cleaned = withoutDiacritics
                .toLowerCase()
                .replaceAll("[^a-z0-9\\s]", "")
@@ -141,4 +184,32 @@ public class BonusManager {
        return cleaned;
    }
    /**
     * Computes the Levenshtein edit distance between two strings: the minimum
     * number of single-character insertions, deletions and substitutions needed to
     * turn one into the other.
     * <p>
     * Only two rows of the dynamic-programming table are kept at a time, since each
     * cell depends solely on the current and the previous row.
     *
     * @param query first string; must not be {@code null}
     * @param title second string; must not be {@code null}
     * @return the edit distance, {@code 0} when the strings are equal
     */
    private static int levenshteinDistance(String query, String title) {
        final int rows = query.length();
        final int cols = title.length();

        // If either side is empty the distance is the length of the other side.
        if (rows == 0 || cols == 0) {
            return Math.max(rows, cols);
        }

        // previous[c] = distance between the first (r - 1) chars of query and the
        // first c chars of title; seeded with the "empty query" row.
        int[] previous = new int[cols + 1];
        int[] current = new int[cols + 1];
        for (int c = 0; c <= cols; c++) {
            previous[c] = c;
        }

        for (int r = 1; r <= rows; r++) {
            final char queryChar = query.charAt(r - 1);
            current[0] = r;
            for (int c = 1; c <= cols; c++) {
                if (queryChar == title.charAt(c - 1)) {
                    // Matching characters cost nothing extra.
                    current[c] = previous[c - 1];
                } else {
                    int substitution = previous[c - 1];
                    int deletion = previous[c];
                    int insertion = current[c - 1];
                    current[c] = 1 + Math.min(Math.min(substitution, deletion), insertion);
                }
            }
            // Reuse the arrays: the finished row becomes the previous row.
            int[] swap = previous;
            previous = current;
            current = swap;
        }
        return previous[cols];
    }
 }
--- a/src/main/java/nl/herpiederpiee/appie_scraper/Main.java
+++ b/src/main/java/nl/herpiederpiee/appie_scraper/Main.java
@@ -1,13 +1,8 @@
 package nl.herpiederpiee.appie_scraper;
 import com.microsoft.playwright.*;
 import org.json.*;
 import org.springframework.boot.SpringApplication;
 import org.springframework.boot.autoconfigure.SpringBootApplication;
 import java.util.ArrayList;
 import java.util.Scanner;
@SpringBootApplication
 public class Main {