Test to see if AI-generated fuzzy search is better

2025-12-29 23:09:55 +01:00
parent b619afe24f
commit adfb3a9fb2
1 changed file with 69 additions and 12 deletions
--- a/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java
+++ b/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java
@@ -3,6 +3,7 @@ package nl.herpiederpiee.appie_scraper;
 import com.microsoft.playwright.*;
 import xyz.nextn.levenshteindistance.LevenshteinDistance;

+import java.text.Normalizer;
 import java.util.ArrayList;
 import java.util.concurrent.TimeUnit;

@@ -63,25 +64,81 @@ public class BonusManager {
    }

    public static int fuzzyMatchScore(String query, String title) {
-        query = query.toLowerCase();
-        title = title.toLowerCase();
-
-        if (title.contains(query)) {
-            return 100; // perfect match
+        if (query == null || title == null || query.isEmpty() || title.isEmpty()) { // null/empty raw inputs are decided here, before any normalization
+            return query == null ? (title == null ? 100 : 0) : (title.isEmpty() ? 100 : 0); // NOTE(review): with a non-null query, a null title throws NullPointerException at title.isEmpty(), and an empty title returns 100 even for a non-empty query - confirm intended
        }

-        int best = Integer.MAX_VALUE;
+        // Normalize both strings via normalize(): strip diacritics, lowercase, drop special chars, collapse whitespace
+        String normalizedQuery = normalize(query);
+        String normalizedTitle = normalize(title);

-        int qlen = query.length();
-        int tlen = title.length();
+        // Exact match after normalization scores the maximum
+        if (normalizedTitle.equals(normalizedQuery)) {
+            return 100;
+        }
+
+        // Substring match (normalized query is contained in normalized title)
+        if (normalizedTitle.contains(normalizedQuery)) {
+            return 95; // Very high score but slightly less than exact; NOTE(review): also taken when normalizedQuery is empty (query consisted only of stripped characters), because contains("") is true
+        }
+
+        int qlen = normalizedQuery.length();
+        int tlen = normalizedTitle.length();
+
+        // Query longer than title: no query-length window fits inside the title, so score 0
+        if (qlen > tlen) {
+            return 0;
+        }
+
+        // Slide a query-length window over the title; track the smallest Levenshtein distance and where it occurred
+        int bestDistance = Integer.MAX_VALUE;
+        int bestPosition = -1;

        for (int i = 0; i <= tlen - qlen; i++) {
-            String sub = title.substring(i, i + qlen);
-            int dist = LevenshteinDistance.calculate(query, sub);
-            if (dist < best) best = dist;
+            String sub = normalizedTitle.substring(i, i + qlen);
+            int dist = LevenshteinDistance.calculate(normalizedQuery, sub);
+
+            if (dist < bestDistance) { // strict '<' keeps the earliest window among ties
+                bestDistance = dist;
+                bestPosition = i;
+
+                if (dist == 0) break; // Perfect match found, can't do better; NOTE(review): presumably unreachable, since a zero-distance window implies the contains() check above already returned - confirm
+            }
        }

-        int score = (int)(100.0 * (1.0 - (best / (double) qlen))); // fancy way to convert it into a % match
+        // Convert the best distance into a similarity fraction: 1.0 at distance 0,
+        // decreasing by 1/qlen for each unit of distance
+        double similarity = 1.0 - (bestDistance / (double) qlen);
+
+        // Position bonus: applied when the best window starts at index 0 of the normalized title
+        if (bestPosition == 0) {
+            similarity *= 1.1; // 10% boost for start matches
+        }
+
+        // Truncate to an integer percentage; the return statement below clamps it to 0-100
+        int score = (int) (similarity * 100.0);
        return Math.max(0, Math.min(100, score));
    }
+
+    private static String normalize(String input) { // builds the canonical form used for matching: no diacritics, lowercase, only a-z/0-9 and single spaces
+        if (input == null || input.isEmpty()) {
+            return input; // null and "" are returned unchanged
+        }
+
+        // Unicode decomposition (NFD): separate base chars from diacritics
+        String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
+
+        // Remove all characters in the Combining Diacritical Marks block
+        String withoutDiacritics = decomposed
+                .replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
+
+        // Lowercase and remove special characters (keep a-z, 0-9 and whitespace)
+        String cleaned = withoutDiacritics
+                .toLowerCase() // NOTE(review): no-arg toLowerCase() uses the default locale; consider Locale.ROOT so results do not vary by environment
+                .replaceAll("[^a-z0-9\\s]", "") // drops everything outside a-z, 0-9 and whitespace, including non-Latin letters
+                .replaceAll("\\s+", " ") // collapse each whitespace run into one space
+                .trim();
+
+        return cleaned;
+    }
 }