From 5b765f9a7db067ab5de05a6312967ca271f0d755 Mon Sep 17 00:00:00 2001 From: Valentijn van der Jagt <1119935@hr.nl> Date: Mon, 29 Dec 2025 23:14:52 +0100 Subject: [PATCH] test to see if ai-generated fuzzy search is better --- .../appie_scraper/BonusManager.java | 129 ++++++++++++++---- .../nl/herpiederpiee/appie_scraper/Main.java | 5 - 2 files changed, 100 insertions(+), 34 deletions(-) diff --git a/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java b/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java index 07c86d9..102235a 100644 --- a/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java +++ b/src/main/java/nl/herpiederpiee/appie_scraper/BonusManager.java @@ -65,59 +65,104 @@ public class BonusManager { public static int fuzzyMatchScore(String query, String title) { if (query == null || title == null || query.isEmpty() || title.isEmpty()) { - return query == null ? (title == null ? 100 : 0) : (title.isEmpty() ? 100 : 0); + return 0; } - // Normalize both strings: remove diacritics, lowercase, remove special chars String normalizedQuery = normalize(query); String normalizedTitle = normalize(title); - // Exact match after normalization - if (normalizedTitle.equals(normalizedQuery)) { + // ===== TIER 1: EXACT WORD MATCH (highest priority) ===== + if (isExactWordMatch(normalizedTitle, normalizedQuery)) { return 100; } - // Substring match (query is contained in title) - if (normalizedTitle.contains(normalizedQuery)) { - return 95; // Very high score but slightly less than exact + // ===== TIER 2: WORD-BOUNDARY SUBSTRING ===== + if (isWordBoundaryMatch(normalizedTitle, normalizedQuery)) { + return 95; } + // ===== TIER 3: PREFIX MATCH ===== + if (isPrefixMatch(normalizedTitle, normalizedQuery)) { + return 85; + } + + // ===== TIER 4: LEVENSHTEIN (typo tolerance) ===== int qlen = normalizedQuery.length(); int tlen = normalizedTitle.length(); - // Query longer than title - impossible match if (qlen > tlen) { return 0; } - // Find the best matching substring using Levenshtein distance int bestDistance = Integer.MAX_VALUE; - int bestPosition = -1; for (int i = 0; i <= tlen - qlen; i++) { String sub = normalizedTitle.substring(i, i + qlen); - int dist = LevenshteinDistance.calculate(normalizedQuery, sub); - + int dist = levenshteinDistance(normalizedQuery, sub); if (dist < bestDistance) { bestDistance = dist; - bestPosition = i; - - if (dist == 0) break; // Perfect match found, can't do better + if (dist == 0) break; } } - // Calculate score: 100% at distance 0, scales down with distance - // Normalize by query length for consistency - double similarity = 1.0 - (bestDistance / (double) qlen); - - // Apply position bonus: matches at the start are better - if (bestPosition == 0) { - similarity *= 1.1; // 10% boost for start matches + // Allow up to 2 edits (typo tolerance) + if (bestDistance <= 2) { + // Distance 0 = 80, Distance 1 = 70, Distance 2 = 60 + int score = 80 - (bestDistance * 10); + return Math.max(0, score); } - // Clamp to 0-100 - int score = (int) (similarity * 100.0); - return Math.max(0, Math.min(100, score)); + return 0; + } + + /** + * Exact word match: query must be surrounded by word boundaries or string edges + * "LOR" matches "L OR" or "LOR coffee" but NOT "LOREAL" + */ + private static boolean isExactWordMatch(String title, String query) { + String[] words = title.split("\\s+"); + for (String word : words) { + if (word.equals(query)) { + return true; + } + } + return false; + } + + /** + * Word boundary match: query matches at word start/end + * "LOR" matches in "L'OR" (after special char removed) + * "REAL" matches in "LOREAL" as word boundary? No, stays in Tier 4 + */ + private static boolean isWordBoundaryMatch(String title, String query) { + // Check if query appears after space or at start + if (title.startsWith(query + " ")) { + return true; + } + + if (title.contains(" " + query)) { + return true; + } + + // Check if query ends at word boundary + if (title.endsWith(" " + query)) { + return true; + } + + return false; + } + + /** + * Prefix match: query is the start of any word + * "CAF" matches in "CAFFE" or "CAFE LATTE" + */ + private static boolean isPrefixMatch(String title, String query) { + for (String word : title.split("\\s+")) { + if (word.startsWith(query) && word.length() > query.length()) { + return true; + } + } + return false; } private static String normalize(String input) { @@ -125,14 +170,12 @@ public class BonusManager { return input; } - // Unicode decomposition: separate base chars from diacritics + // Remove diacritics String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD); - - // Remove all combining diacritical marks String withoutDiacritics = decomposed .replaceAll("\\p{InCombiningDiacriticalMarks}+", ""); - // Lowercase and remove special characters (keep alphanumeric + spaces) + // Lowercase, remove special chars, normalize spaces String cleaned = withoutDiacritics .toLowerCase() .replaceAll("[^a-z0-9\\s]", "") @@ -141,4 +184,32 @@ public class BonusManager { return cleaned; } + + private static int levenshteinDistance(String query, String title) { + int qlen = query.length(); + int tlen = title.length(); + + if (qlen == 0) return tlen; + if (tlen == 0) return qlen; + + int[][] dp = new int[qlen + 1][tlen + 1]; + + for (int i = 0; i <= qlen; i++) dp[i][0] = i; + for (int j = 0; j <= tlen; j++) dp[0][j] = j; + + for (int i = 1; i <= qlen; i++) { + for (int j = 1; j <= tlen; j++) { + if (query.charAt(i - 1) == title.charAt(j - 1)) { + dp[i][j] = dp[i - 1][j - 1]; + } else { + dp[i][j] = 1 + Math.min( + Math.min(dp[i - 1][j - 1], dp[i - 1][j]), + dp[i][j - 1] + ); + } + } + } + + return dp[qlen][tlen]; + } } diff --git a/src/main/java/nl/herpiederpiee/appie_scraper/Main.java b/src/main/java/nl/herpiederpiee/appie_scraper/Main.java index e6e3fd0..1526763 100644 --- a/src/main/java/nl/herpiederpiee/appie_scraper/Main.java +++ b/src/main/java/nl/herpiederpiee/appie_scraper/Main.java @@ -1,13 +1,8 @@ package nl.herpiederpiee.appie_scraper; -import com.microsoft.playwright.*; -import org.json.*; import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; -import java.util.ArrayList; -import java.util.Scanner; - @SpringBootApplication public class Main {