NeoMutt  2025-12-11-694-ga89709
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
subseq.c File Reference

Subsequence fuzzy matching implementation. More...

#include "config.h"
#include <stdbool.h>
#include <string.h>
#include "lib.h"
#include "fuzzy.h"
+ Include dependency graph for subseq.c:

Go to the source code of this file.

Macros

#define DEFAULT_MAX_PATTERN   256
 Default maximum pattern length.
 

Functions

static unsigned char ascii_tolower (unsigned char c)
 Convert ASCII character to lowercase.
 
static int lower_if (int c, bool fold)
 Convert character to lowercase conditionally.
 
static bool utf8_is_continuation (unsigned char c)
 Check for UTF-8 continuation byte.
 
static int utf8_char_len (const char *s)
 Get length of a UTF-8 codepoint at a byte offset.
 
static bool compute_case_mode (const char *pattern, const struct FuzzyOptions *opts)
 Determine if case folding should be used.
 
int fuzzy_subseq_match (const char *pattern, const char *candidate, const struct FuzzyOptions *opts, struct FuzzyResult *out)
 Perform subsequence fuzzy matching (UTF-8 aware, ASCII case-folding)
 

Detailed Description

Subsequence fuzzy matching implementation.

Authors
  • Richard Russon

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file subseq.c.

Macro Definition Documentation

◆ DEFAULT_MAX_PATTERN

#define DEFAULT_MAX_PATTERN   256

Default maximum pattern length.

Definition at line 133 of file subseq.c.

Function Documentation

◆ ascii_tolower()

static unsigned char ascii_tolower ( unsigned char c)
inlinestatic

Convert ASCII character to lowercase.

Parameters
cCharacter to convert
Return values
numLowercase character if ASCII A-Z, otherwise unchanged

This function only converts ASCII characters (A-Z to a-z). All other bytes (including UTF-8 continuation bytes) are left unchanged. This preserves UTF-8 sequences while allowing case-insensitive ASCII matching.

Definition at line 144 of file subseq.c.

145{
146 if ((c >= 'A') && (c <= 'Z'))
147 return c + ('a' - 'A');
148 return c;
149}
+ Here is the caller graph for this function:

◆ lower_if()

static int lower_if ( int c,
bool fold )
inlinestatic

Convert character to lowercase conditionally.

Parameters
cCharacter to convert
foldIf true, convert ASCII to lowercase
Return values
numConverted character

Only performs ASCII case folding (A-Z to a-z). All other bytes are left unchanged, preserving UTF-8 sequences.

Definition at line 160 of file subseq.c.

161{
162 return fold ? ascii_tolower((unsigned char) c) : c;
163}
static unsigned char ascii_tolower(unsigned char c)
Convert ASCII character to lowercase.
Definition subseq.c:144
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ utf8_is_continuation()

static bool utf8_is_continuation ( unsigned char c)
inlinestatic

Check for UTF-8 continuation byte.

Parameters
cByte to check
Return values
trueByte is in range 0x80-0xBF
falseByte is not a continuation byte

Definition at line 171 of file subseq.c.

172{
173 return (c & 0xC0) == 0x80;
174}
+ Here is the caller graph for this function:

◆ utf8_char_len()

static int utf8_char_len ( const char * s)
static

Get length of a UTF-8 codepoint at a byte offset.

Parameters
sString position
Return values
numNumber of bytes in this codepoint (1-4)

Returns 1 for ASCII and malformed sequences.

Definition at line 183 of file subseq.c.

184{
185 const unsigned char c0 = (unsigned char) s[0];
186
187 if (c0 < 0x80)
188 return 1;
189
190 if ((c0 >= 0xC2) && (c0 <= 0xDF))
191 {
192 if (s[1] && utf8_is_continuation((unsigned char) s[1]))
193 return 2;
194 return 1;
195 }
196
197 if ((c0 >= 0xE0) && (c0 <= 0xEF))
198 {
199 if (s[1] && s[2] && utf8_is_continuation((unsigned char) s[1]) &&
200 utf8_is_continuation((unsigned char) s[2]))
201 return 3;
202 return 1;
203 }
204
205 if ((c0 >= 0xF0) && (c0 <= 0xF4))
206 {
207 if (s[1] && s[2] && s[3] && utf8_is_continuation((unsigned char) s[1]) &&
208 utf8_is_continuation((unsigned char) s[2]) &&
209 utf8_is_continuation((unsigned char) s[3]))
210 return 4;
211 return 1;
212 }
213
214 return 1;
215}
static bool utf8_is_continuation(unsigned char c)
Check for UTF-8 continuation byte.
Definition subseq.c:171
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ compute_case_mode()

static bool compute_case_mode ( const char * pattern,
const struct FuzzyOptions * opts )
static

Determine if case folding should be used.

Parameters
patternPattern string
optsFuzzy matching options
Return values
trueUse case-insensitive matching (ASCII only)
falseUse case-sensitive matching

Smart case only examines ASCII characters (A-Z). Non-ASCII bytes are ignored for smart case detection.

Definition at line 227 of file subseq.c.

228{
229 if (!opts)
230 return true; // default case-insensitive
231
232 if (opts->case_sensitive)
233 return false;
234
235 if (opts->smart_case)
236 {
237 // Check if pattern contains any ASCII uppercase (A-Z)
238 for (const char *p = pattern; *p; p++)
239 {
240 unsigned char c = (unsigned char) *p;
241 if ((c >= 'A') && (c <= 'Z'))
242 return false; // found uppercase, use case-sensitive
243 }
244 }
245
246 return true; // fold case
247}
bool smart_case
Auto case-sensitive if pattern has uppercase.
Definition lib.h:89
bool case_sensitive
Match case exactly.
Definition lib.h:88
+ Here is the caller graph for this function:

◆ fuzzy_subseq_match()

int fuzzy_subseq_match ( const char * pattern,
const char * candidate,
const struct FuzzyOptions * opts,
struct FuzzyResult * out )

Perform subsequence fuzzy matching (UTF-8 aware, ASCII case-folding)

Parameters
patternPattern to match (UTF-8 byte sequence)
candidateCandidate string to match against (UTF-8 byte sequence)
optsFuzzy matching options
outOutput result structure
Return values
>=0Match score
-1Error or no match

Performs byte-wise subsequence matching with ASCII-only case folding. UTF-8 multi-byte sequences are preserved but matched as raw bytes. Only ASCII A-Z characters are case-folded to a-z.

Definition at line 262 of file subseq.c.

264{
265 if (!pattern || !candidate)
266 return -1;
267
268 size_t plen = strlen(pattern);
269 if (plen == 0)
270 return -1;
271
272 int max_pattern = (opts && opts->max_pattern) ? opts->max_pattern : DEFAULT_MAX_PATTERN;
273 if ((max_pattern <= 0) || (max_pattern > DEFAULT_MAX_PATTERN))
274 max_pattern = DEFAULT_MAX_PATTERN;
275
276 if (plen > (size_t) max_pattern)
277 return -1;
278
279 bool fold = compute_case_mode(pattern, opts);
280
281 int matchpos[DEFAULT_MAX_PATTERN];
282
283 int pi = 0;
284 int ci = 0;
285 int score = 0;
286
287 int first = -1;
288 int last = -1;
289
290 // Forward subsequence scan
291 while (candidate[ci] && pi < (int) plen)
292 {
293 const unsigned char pbyte = (unsigned char) pattern[pi];
294 if (pbyte < 0x80)
295 {
296 int pc = lower_if(pattern[pi], fold);
297 int cc = lower_if(candidate[ci], fold);
298
299 if (pc == cc)
300 {
301 matchpos[pi] = ci;
302
303 if (first < 0)
304 first = ci;
305
306 last = ci;
307 pi++;
308 }
309
310 ci++;
311 continue;
312 }
313
314 // Non-ASCII pattern bytes must match as a whole UTF-8 codepoint.
315 const int pchar_len = utf8_char_len(pattern + pi);
316 const int premaining = (int) plen - pi;
317 if ((pchar_len <= 0) || (pchar_len > premaining))
318 return -1;
319 bool matched = false;
320
321 while (candidate[ci])
322 {
323 const unsigned char cbyte = (unsigned char) candidate[ci];
324 if (utf8_is_continuation(cbyte))
325 {
326 ci++;
327 continue;
328 }
329
330 const int cchar_len = utf8_char_len(candidate + ci);
331 if ((pchar_len == cchar_len) && (memcmp(pattern + pi, candidate + ci, pchar_len) == 0))
332 {
333 for (int k = 0; k < pchar_len; k++)
334 matchpos[pi + k] = ci + k;
335
336 if (first < 0)
337 first = ci;
338
339 last = ci + pchar_len - 1;
340 pi += pchar_len;
341 ci += cchar_len;
342 matched = true;
343 break;
344 }
345
346 ci += cchar_len;
347 }
348
349 if (!matched)
350 break;
351 }
352
353 if (pi != (int) plen)
354 return -1; // not a subsequence
355
356 // Scoring
357
358 // Base score
359 score += plen * 10;
360
361 // Consecutive & gap penalties
362 for (int i = 1; i < pi; i++)
363 {
364 int gap = matchpos[i] - matchpos[i - 1] - 1;
365
366 if (gap == 0)
367 score += 15; // compact match bonus
368 else
369 score -= gap * 2;
370 }
371
372 // Span penalty
373 int span = last - first + 1;
374 score -= span;
375
376 // Prefix bonus
377 if (first == 0 && opts && opts->prefer_prefix)
378 score += 40;
379
380 // Boundary bonus (ASCII-only separators and CamelCase)
381 for (int i = 0; i < pi; i++)
382 {
383 int pos = matchpos[i];
384
385 if (pos == 0)
386 score += 30; // start of string
387 else
388 {
389 unsigned char prev = (unsigned char) candidate[pos - 1];
390 unsigned char curr = (unsigned char) candidate[pos];
391
392 // ASCII separator boundaries
393 if ((prev == '/') || (prev == '.') || (prev == '-') || (prev == '_'))
394 score += 15;
395 // ASCII CamelCase boundary (lowercase followed by uppercase)
396 else if (((prev >= 'a') && (prev <= 'z')) && ((curr >= 'A') && (curr <= 'Z')))
397 score += 10;
398 }
399 }
400
401 // Mild length penalty
402 score -= (int) strlen(candidate) / 4;
403
404 // Ensure valid matches always return non-negative score
405 if (score < 0)
406 score = 0;
407
408 if (out)
409 {
410 out->score = score;
411 out->span = span;
412 out->start = first;
413 out->end = last;
414 }
415
416 return score;
417}
bool candidate(struct CompletionData *cd, char *user, const char *src, char *dest, size_t dlen)
Helper function for completion.
Definition helpers.c:79
int max_pattern
Safety bound (<=0 = default 256, capped at 256)
Definition lib.h:91
bool prefer_prefix
Extra weight for prefix matches.
Definition lib.h:90
int score
Score (<0 = no match)
Definition lib.h:99
int span
Match span.
Definition lib.h:100
int start
First match position.
Definition lib.h:101
int end
Last match position.
Definition lib.h:102
static int utf8_char_len(const char *s)
Get length of a UTF-8 codepoint at a byte offset.
Definition subseq.c:183
static int lower_if(int c, bool fold)
Convert character to lowercase conditionally.
Definition subseq.c:160
static bool compute_case_mode(const char *pattern, const struct FuzzyOptions *opts)
Determine if case folding should be used.
Definition subseq.c:227
#define DEFAULT_MAX_PATTERN
Default maximum pattern length.
Definition subseq.c:133
+ Here is the call graph for this function:
+ Here is the caller graph for this function: