NeoMutt  2025-12-11-694-ga89709
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
subseq.c
Go to the documentation of this file.
1
22
125
126#include "config.h"
127#include <stdbool.h>
128#include <string.h>
129#include "lib.h"
130#include "fuzzy.h"
131
133#define DEFAULT_MAX_PATTERN 256
134
144static inline unsigned char ascii_tolower(unsigned char c)
145{
146 if ((c >= 'A') && (c <= 'Z'))
147 return c + ('a' - 'A');
148 return c;
149}
150
160static inline int lower_if(int c, bool fold)
161{
162 return fold ? ascii_tolower((unsigned char) c) : c;
163}
164
171static inline bool utf8_is_continuation(unsigned char c)
172{
173 return (c & 0xC0) == 0x80;
174}
175
183static int utf8_char_len(const char *s)
184{
185 const unsigned char c0 = (unsigned char) s[0];
186
187 if (c0 < 0x80)
188 return 1;
189
190 if ((c0 >= 0xC2) && (c0 <= 0xDF))
191 {
192 if (s[1] && utf8_is_continuation((unsigned char) s[1]))
193 return 2;
194 return 1;
195 }
196
197 if ((c0 >= 0xE0) && (c0 <= 0xEF))
198 {
199 if (s[1] && s[2] && utf8_is_continuation((unsigned char) s[1]) &&
200 utf8_is_continuation((unsigned char) s[2]))
201 return 3;
202 return 1;
203 }
204
205 if ((c0 >= 0xF0) && (c0 <= 0xF4))
206 {
207 if (s[1] && s[2] && s[3] && utf8_is_continuation((unsigned char) s[1]) &&
208 utf8_is_continuation((unsigned char) s[2]) &&
209 utf8_is_continuation((unsigned char) s[3]))
210 return 4;
211 return 1;
212 }
213
214 return 1;
215}
216
227static bool compute_case_mode(const char *pattern, const struct FuzzyOptions *opts)
228{
229 if (!opts)
230 return true; // default case-insensitive
231
232 if (opts->case_sensitive)
233 return false;
234
235 if (opts->smart_case)
236 {
237 // Check if pattern contains any ASCII uppercase (A-Z)
238 for (const char *p = pattern; *p; p++)
239 {
240 unsigned char c = (unsigned char) *p;
241 if ((c >= 'A') && (c <= 'Z'))
242 return false; // found uppercase, use case-sensitive
243 }
244 }
245
246 return true; // fold case
247}
248
262int fuzzy_subseq_match(const char *pattern, const char *candidate,
263 const struct FuzzyOptions *opts, struct FuzzyResult *out)
264{
265 if (!pattern || !candidate)
266 return -1;
267
268 size_t plen = strlen(pattern);
269 if (plen == 0)
270 return -1;
271
272 int max_pattern = (opts && opts->max_pattern) ? opts->max_pattern : DEFAULT_MAX_PATTERN;
273 if ((max_pattern <= 0) || (max_pattern > DEFAULT_MAX_PATTERN))
274 max_pattern = DEFAULT_MAX_PATTERN;
275
276 if (plen > (size_t) max_pattern)
277 return -1;
278
279 bool fold = compute_case_mode(pattern, opts);
280
281 int matchpos[DEFAULT_MAX_PATTERN];
282
283 int pi = 0;
284 int ci = 0;
285 int score = 0;
286
287 int first = -1;
288 int last = -1;
289
290 // Forward subsequence scan
291 while (candidate[ci] && pi < (int) plen)
292 {
293 const unsigned char pbyte = (unsigned char) pattern[pi];
294 if (pbyte < 0x80)
295 {
296 int pc = lower_if(pattern[pi], fold);
297 int cc = lower_if(candidate[ci], fold);
298
299 if (pc == cc)
300 {
301 matchpos[pi] = ci;
302
303 if (first < 0)
304 first = ci;
305
306 last = ci;
307 pi++;
308 }
309
310 ci++;
311 continue;
312 }
313
314 // Non-ASCII pattern bytes must match as a whole UTF-8 codepoint.
315 const int pchar_len = utf8_char_len(pattern + pi);
316 const int premaining = (int) plen - pi;
317 if ((pchar_len <= 0) || (pchar_len > premaining))
318 return -1;
319 bool matched = false;
320
321 while (candidate[ci])
322 {
323 const unsigned char cbyte = (unsigned char) candidate[ci];
324 if (utf8_is_continuation(cbyte))
325 {
326 ci++;
327 continue;
328 }
329
330 const int cchar_len = utf8_char_len(candidate + ci);
331 if ((pchar_len == cchar_len) && (memcmp(pattern + pi, candidate + ci, pchar_len) == 0))
332 {
333 for (int k = 0; k < pchar_len; k++)
334 matchpos[pi + k] = ci + k;
335
336 if (first < 0)
337 first = ci;
338
339 last = ci + pchar_len - 1;
340 pi += pchar_len;
341 ci += cchar_len;
342 matched = true;
343 break;
344 }
345
346 ci += cchar_len;
347 }
348
349 if (!matched)
350 break;
351 }
352
353 if (pi != (int) plen)
354 return -1; // not a subsequence
355
356 // Scoring
357
358 // Base score
359 score += plen * 10;
360
361 // Consecutive & gap penalties
362 for (int i = 1; i < pi; i++)
363 {
364 int gap = matchpos[i] - matchpos[i - 1] - 1;
365
366 if (gap == 0)
367 score += 15; // compact match bonus
368 else
369 score -= gap * 2;
370 }
371
372 // Span penalty
373 int span = last - first + 1;
374 score -= span;
375
376 // Prefix bonus
377 if (first == 0 && opts && opts->prefer_prefix)
378 score += 40;
379
380 // Boundary bonus (ASCII-only separators and CamelCase)
381 for (int i = 0; i < pi; i++)
382 {
383 int pos = matchpos[i];
384
385 if (pos == 0)
386 score += 30; // start of string
387 else
388 {
389 unsigned char prev = (unsigned char) candidate[pos - 1];
390 unsigned char curr = (unsigned char) candidate[pos];
391
392 // ASCII separator boundaries
393 if ((prev == '/') || (prev == '.') || (prev == '-') || (prev == '_'))
394 score += 15;
395 // ASCII CamelCase boundary (lowercase followed by uppercase)
396 else if (((prev >= 'a') && (prev <= 'z')) && ((curr >= 'A') && (curr <= 'Z')))
397 score += 10;
398 }
399 }
400
401 // Mild length penalty
402 score -= (int) strlen(candidate) / 4;
403
404 // Ensure valid matches always return non-negative score
405 if (score < 0)
406 score = 0;
407
408 if (out)
409 {
410 out->score = score;
411 out->span = span;
412 out->start = first;
413 out->end = last;
414 }
415
416 return score;
417}
bool candidate(struct CompletionData *cd, char *user, const char *src, char *dest, size_t dlen)
Helper function for completion.
Definition helpers.c:79
Fuzzy matching library.
Fuzzy matching library - private definitions.
Options for fuzzy matching.
Definition lib.h:87
bool smart_case
Auto case-sensitive if pattern has uppercase.
Definition lib.h:89
bool case_sensitive
Match case exactly.
Definition lib.h:88
int max_pattern
Safety bound (<=0 = default 256, capped at 256)
Definition lib.h:91
bool prefer_prefix
Extra weight for prefix matches.
Definition lib.h:90
Result of a fuzzy match.
Definition lib.h:98
int score
Score (<0 = no match)
Definition lib.h:99
int span
Match span.
Definition lib.h:100
int start
First match position.
Definition lib.h:101
int end
Last match position.
Definition lib.h:102
int fuzzy_subseq_match(const char *pattern, const char *candidate, const struct FuzzyOptions *opts, struct FuzzyResult *out)
Perform subsequence fuzzy matching (UTF-8 aware, ASCII case-folding)
Definition subseq.c:262
static int utf8_char_len(const char *s)
Get length of a UTF-8 codepoint at a byte offset.
Definition subseq.c:183
static bool utf8_is_continuation(unsigned char c)
Check for UTF-8 continuation byte.
Definition subseq.c:171
static unsigned char ascii_tolower(unsigned char c)
Convert ASCII character to lowercase.
Definition subseq.c:144
static int lower_if(int c, bool fold)
Convert character to lowercase conditionally.
Definition subseq.c:160
static bool compute_case_mode(const char *pattern, const struct FuzzyOptions *opts)
Determine if case folding should be used.
Definition subseq.c:227
#define DEFAULT_MAX_PATTERN
Default maximum pattern length.
Definition subseq.c:133