NeoMutt  2025-12-11-694-ga89709
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
lib.h File Reference

Conversion between different character encodings. More...

#include <stdio.h>
+ Include dependency graph for lib.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Functions

size_t mutt_convert_file_from_to (FILE *fp, const struct Slist *fromcodes, const struct Slist *tocodes, char **fromcode, char **tocode, struct Content *info)
 Convert a file between encodings.
 
size_t mutt_convert_file_to (FILE *fp, const char *fromcode, struct Slist const *const tocodes, int *tocode, struct Content *info)
 Change the encoding of a file.
 
struct Content * mutt_get_content_info (const char *fname, struct Body *b, struct ConfigSubset *sub)
 Analyze file to determine MIME encoding to use.
 
void mutt_update_content_info (struct Content *info, struct ContentState *s, char *buf, size_t buflen)
 Cache some info about an email.
 

Detailed Description

Conversion between different character encodings.

Authors
  • Michal Siedlaczek
  • Richard Russon

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file lib.h.

Function Documentation

◆ mutt_convert_file_from_to()

size_t mutt_convert_file_from_to ( FILE * fp,
const struct Slist * fromcodes,
const struct Slist * tocodes,
char ** fromcode,
char ** tocode,
struct Content * info )

Convert a file between encodings.

Parameters
[in] fp: File to read from
[in] fromcodes: Charsets to try converting FROM
[in] tocodes: Charsets to try converting TO
[out] fromcode: From charset selected
[out] tocode: To charset selected
[out] info: Info about the file
Return values
num: Characters converted
ICONV_ILLEGAL_SEQ: Error (as a size_t)

Find the first of the fromcodes that gives a valid conversion and the best charset conversion of the file into one of the tocodes. If successful, set *fromcode and *tocode to dynamically allocated strings, set Content *info, and return the number of characters converted inexactly. If no conversion was possible, return -1.

Definition at line 215 of file convert.c.

218{
219 char **tcode = NULL;
220 size_t rc;
221 int cn;
222 struct ListNode *np = NULL;
223
224 /* Copy them */
225 tcode = MUTT_MEM_CALLOC(tocodes->count, char *);
226 np = NULL;
227 cn = 0;
228 STAILQ_FOREACH(np, &tocodes->head, entries)
229 {
230 tcode[cn++] = mutt_str_dup(np->data);
231 }
232
233 rc = ICONV_ILLEGAL_SEQ;
234 np = NULL;
235 cn = 0;
236 STAILQ_FOREACH(np, &fromcodes->head, entries)
237 {
238 /* Try each fromcode in turn */
239 rc = mutt_convert_file_to(fp, np->data, tocodes, &cn, info);
240 if (rc != ICONV_ILLEGAL_SEQ)
241 {
242 *fromcode = mutt_str_dup(np->data);
243 *tocode = tcode[cn];
244 tcode[cn] = 0;
245 break;
246 }
247 }
248
249 /* Free memory */
250 for (cn = 0; cn < tocodes->count; cn++)
251 FREE(&tcode[cn]);
252
253 FREE(&tcode);
254
255 return rc;
256}
size_t mutt_convert_file_to(FILE *fp, const char *fromcode, struct Slist const *const tocodes, int *tocode, struct Content *info)
Change the encoding of a file.
Definition convert.c:64
#define FREE(x)
Free memory and set the pointer to NULL.
Definition memory.h:68
#define MUTT_MEM_CALLOC(n, type)
Definition memory.h:52
#define ICONV_ILLEGAL_SEQ
Error value for iconv() - Illegal sequence.
Definition charset.h:114
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition string.c:257
#define STAILQ_FOREACH(var, head, field)
Definition queue.h:390
A List node for strings.
Definition list.h:37
char * data
String.
Definition list.h:38
struct ListHead head
List containing values.
Definition slist.h:38
size_t count
Number of values in list.
Definition slist.h:39
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_convert_file_to()

size_t mutt_convert_file_to ( FILE * fp,
const char * fromcode,
struct Slist const *const tocodes,
int * tocode,
struct Content * info )

Change the encoding of a file.

Parameters
[in] fp: File to convert
[in] fromcode: Original encoding
[in] tocodes: List of target encodings
[out] tocode: Chosen encoding
[out] info: Encoding information
Return values
-1: Error, no conversion was possible
>=0: Success, number of characters converted inexactly

Find the best charset conversion of the file from fromcode into one of the tocodes. If successful, set *tocode and Content *info and return the number of characters converted inexactly.

We convert via UTF-8 in order to avoid the condition -1(EINVAL), which would otherwise prevent us from knowing the number of inexact conversions. Where the candidate target charset is UTF-8 we avoid doing the second conversion because iconv_open("UTF-8", "UTF-8") fails with some libraries.

We assume that the output from iconv is never more than 4 times as long as the input for any pair of charsets we might be interested in.

Definition at line 64 of file convert.c.

66{
67 char bufi[256] = { 0 };
68 char bufu[512] = { 0 };
69 char bufo[4 * sizeof(bufi)] = { 0 };
70 size_t rc = ICONV_ILLEGAL_SEQ;
71
72 const iconv_t cd1 = mutt_ch_iconv_open("utf-8", fromcode, MUTT_ICONV_NO_FLAGS);
73 if (!iconv_t_valid(cd1))
74 return -1;
75
76 int ncodes = tocodes->count;
77 iconv_t *cd = MUTT_MEM_CALLOC(ncodes, iconv_t);
78 size_t *score = MUTT_MEM_CALLOC(ncodes, size_t);
79 struct ContentState *states = MUTT_MEM_CALLOC(ncodes, struct ContentState);
80 struct Content *infos = MUTT_MEM_CALLOC(ncodes, struct Content);
81
82 struct ListNode *np = NULL;
83 int ni = 0;
84 STAILQ_FOREACH(np, &tocodes->head, entries)
85 {
86 if (!mutt_istr_equal(np->data, "utf-8"))
87 {
88 cd[ni] = mutt_ch_iconv_open(np->data, "utf-8", MUTT_ICONV_NO_FLAGS);
89 }
90 else
91 {
92 /* Special case for conversion to UTF-8 */
93 cd[ni] = ICONV_T_INVALID;
94 score[ni] = ICONV_ILLEGAL_SEQ;
95 }
96 ni += 1;
97 }
98
99 rewind(fp);
100 size_t ibl = 0;
101 while (true)
102 {
103 /* Try to fill input buffer */
104 size_t n = fread(bufi + ibl, 1, sizeof(bufi) - ibl, fp);
105 ibl += n;
106
107 /* Convert to UTF-8 */
108 const char *ib = bufi;
109 char *ob = bufu;
110 size_t obl = sizeof(bufu);
111 n = iconv(cd1, (ICONV_CONST char **) ((ibl != 0) ? &ib : 0), &ibl, &ob, &obl);
112 if ((n == ICONV_ILLEGAL_SEQ) && (((errno != EINVAL) && (errno != E2BIG)) || (ib == bufi)))
113 {
115 break;
116 }
117 const size_t ubl1 = ob - bufu;
118
119 /* Convert from UTF-8 */
120 for (int i = 0; i < ncodes; i++)
121 {
122 if (iconv_t_valid(cd[i]) && (score[i] != ICONV_ILLEGAL_SEQ))
123 {
124 const char *ub = bufu;
125 size_t ubl = ubl1;
126 ob = bufo;
127 obl = sizeof(bufo);
128 n = iconv(cd[i], (ICONV_CONST char **) ((ibl || ubl) ? &ub : 0), &ubl, &ob, &obl);
129 if (n == ICONV_ILLEGAL_SEQ)
130 {
131 score[i] = ICONV_ILLEGAL_SEQ;
132 }
133 else
134 {
135 score[i] += n;
136 mutt_update_content_info(&infos[i], &states[i], bufo, ob - bufo);
137 }
138 }
139 else if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ))
140 {
141 /* Special case for conversion to UTF-8 */
142 mutt_update_content_info(&infos[i], &states[i], bufu, ubl1);
143 }
144 }
145
146 if (ibl)
147 {
148 /* Save unused input */
149 memmove(bufi, ib, ibl);
150 }
151 else if (!ubl1 && (ib < bufi + sizeof(bufi)))
152 {
153 rc = 0;
154 break;
155 }
156 }
157
158 if (rc == 0)
159 {
160 /* Find best score */
161 rc = ICONV_ILLEGAL_SEQ;
162 for (int i = 0; i < ncodes; i++)
163 {
164 if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ))
165 {
166 /* Special case for conversion to UTF-8 */
167 *tocode = i;
168 rc = 0;
169 break;
170 }
171 else if (!iconv_t_valid(cd[i]) || (score[i] == ICONV_ILLEGAL_SEQ))
172 {
173 continue;
174 }
175 else if ((rc == ICONV_ILLEGAL_SEQ) || (score[i] < rc))
176 {
177 *tocode = i;
178 rc = score[i];
179 if (rc == 0)
180 break;
181 }
182 }
183 if (rc != ICONV_ILLEGAL_SEQ)
184 {
185 memcpy(info, &infos[*tocode], sizeof(struct Content));
186 mutt_update_content_info(info, &states[*tocode], 0, 0); /* EOF */
187 }
188 }
189
190 FREE(&cd);
191 FREE(&infos);
192 FREE(&score);
193 FREE(&states);
194
195 return rc;
196}
void mutt_update_content_info(struct Content *info, struct ContentState *s, char *buf, size_t buflen)
Cache some info about an email.
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition charset.c:580
#define ICONV_T_INVALID
Error value for iconv functions.
Definition charset.h:111
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition charset.h:66
static bool iconv_t_valid(const iconv_t cd)
Is the conversion descriptor valid?
Definition charset.h:123
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition string.c:677
Info about the body of an email.
Definition content.h:56
Info about an attachment.
Definition content.h:35
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_get_content_info()

struct Content * mutt_get_content_info ( const char * fname,
struct Body * b,
struct ConfigSubset * sub )

Analyze file to determine MIME encoding to use.

Parameters
fname: File to examine
b: Body to update
sub: Config Subset
Return values
ptr: Newly allocated Content
NULL: Error (no filename, or the file can't be opened, stat'ed, or isn't a regular file)

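For a text body that allows conversion (neither noconv nor force_charset is set), this may also set the body's "charset" parameter and, when a charset conversion succeeds, b->charset.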

Definition at line 197 of file content_info.c.

199{
200 struct Content *info = NULL;
201 struct ContentState cstate = { 0 };
202 FILE *fp = NULL;
203 char *fromcode = NULL;
204 char *tocode = NULL;
205 char buf[100] = { 0 };
206 size_t r;
207
208 struct stat st = { 0 };
209
210 if (b && !fname)
211 fname = b->filename;
212 if (!fname)
213 return NULL;
214
215 fp = mutt_file_fopen(fname, "r");
216 if (!fp)
217 {
218 mutt_debug(LL_DEBUG1, "%s: %s (errno %d)\n", fname, strerror(errno), errno);
219 return NULL;
220 }
221
222 if (fstat(fileno(fp), &st) == -1)
223 {
224 mutt_error(_("Can't stat %s: %s"), fname, strerror(errno));
225 mutt_file_fclose(&fp);
226 return NULL;
227 }
228
229 if (!S_ISREG(st.st_mode))
230 {
231 mutt_error(_("%s isn't a regular file"), fname);
232 mutt_file_fclose(&fp);
233 return NULL;
234 }
235
236 info = MUTT_MEM_CALLOC(1, struct Content);
237
238 const char *const c_charset = cc_charset();
239 if (b && (b->type == TYPE_TEXT) && (!b->noconv && !b->force_charset))
240 {
241 const struct Slist *const c_attach_charset = cs_subset_slist(sub, "attach_charset");
242 const struct Slist *const c_send_charset = cs_subset_slist(sub, "send_charset");
243 struct Slist *c_charset_slist = slist_parse(c_charset, D_SLIST_SEP_COLON);
244
245 const struct Slist *fchs = b->use_disp ?
246 (c_attach_charset ? c_attach_charset : c_charset_slist) :
247 c_charset_slist;
248
249 struct Slist *chs = slist_parse(mutt_param_get(&b->parameter, "charset"), D_SLIST_SEP_COLON);
250
251 if (c_charset && (chs || c_send_charset) &&
252 (mutt_convert_file_from_to(fp, fchs, chs ? chs : c_send_charset, &fromcode,
253 &tocode, info) != ICONV_ILLEGAL_SEQ))
254 {
255 if (!chs)
256 {
257 char chsbuf[256] = { 0 };
258 mutt_ch_canonical_charset(chsbuf, sizeof(chsbuf), tocode);
259 mutt_param_set(&b->parameter, "charset", chsbuf);
260 }
261 FREE(&b->charset);
262 b->charset = fromcode;
263 fromcode = NULL;
264 FREE(&tocode);
265 mutt_file_fclose(&fp);
266 slist_free(&c_charset_slist);
267 slist_free(&chs);
268 return info;
269 }
270
271 slist_free(&c_charset_slist);
272 slist_free(&chs);
273 }
274
275 rewind(fp);
276 while ((r = fread(buf, 1, sizeof(buf), fp)))
277 mutt_update_content_info(info, &cstate, buf, r);
278 mutt_update_content_info(info, &cstate, 0, 0);
279
280 mutt_file_fclose(&fp);
281
282 if (b && (b->type == TYPE_TEXT) && (!b->noconv && !b->force_charset))
283 {
284 mutt_param_set(&b->parameter, "charset",
285 (!info->hibin ? "us-ascii" :
286 c_charset && !mutt_ch_is_us_ascii(c_charset) ? c_charset :
287 "unknown-8bit"));
288 }
289
290 return info;
291}
const struct Slist * cs_subset_slist(const struct ConfigSubset *sub, const char *name)
Get a string-list config item by name.
Definition helpers.c:242
const char * cc_charset(void)
Get the cached value of $charset.
size_t mutt_convert_file_from_to(FILE *fp, const struct Slist *fromcodes, const struct Slist *tocodes, char **fromcode, char **tocode, struct Content *info)
Convert a file between encodings.
Definition convert.c:215
#define mutt_file_fclose(FP)
Definition file.h:139
#define mutt_file_fopen(PATH, MODE)
Definition file.h:138
#define mutt_error(...)
Definition logging2.h:94
#define mutt_debug(LEVEL,...)
Definition logging2.h:91
@ LL_DEBUG1
Log at debug level 1.
Definition logging2.h:45
@ TYPE_TEXT
Type: 'text/*'.
Definition mime.h:38
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition charset.c:360
#define mutt_ch_is_us_ascii(str)
Definition charset.h:108
#define _(a)
Definition message.h:28
struct Slist * slist_parse(const char *str, uint32_t flags)
Parse a list of strings into a list.
Definition slist.c:177
void slist_free(struct Slist **ptr)
Free an Slist object.
Definition slist.c:124
char * mutt_param_get(const struct ParameterList *pl, const char *s)
Find a matching Parameter.
Definition parameter.c:85
void mutt_param_set(struct ParameterList *pl, const char *attribute, const char *value)
Set a Parameter.
Definition parameter.c:111
bool noconv
Don't do character set conversion.
Definition body.h:46
char * charset
Send mode: charset of attached file as stored on disk.
Definition body.h:79
struct ParameterList parameter
Parameters of the content-type.
Definition body.h:63
bool use_disp
Content-Disposition uses filename= ?
Definition body.h:47
bool force_charset
Send mode: don't adjust the character set when in send-mode.
Definition body.h:44
unsigned int type
content-type primary type, ContentType
Definition body.h:40
char * filename
When sending a message, this is the file to which this structure refers.
Definition body.h:59
long hibin
8-bit characters
Definition content.h:36
String list.
Definition slist.h:37
#define D_SLIST_SEP_COLON
Slist items are colon-separated.
Definition types.h:112
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_update_content_info()

void mutt_update_content_info ( struct Content * info,
struct ContentState * s,
char * buf,
size_t buflen )

Cache some info about an email.

Parameters
info: Info about an Attachment
s: Info about the Body of an email
buf: Buffer of data to analyse (NULL signals end-of-file)
buflen: Length of the buffer

Definition at line 49 of file content_info.c.

51{
52 bool from = s->from;
53 int whitespace = s->whitespace;
54 bool dot = s->dot;
55 int linelen = s->linelen;
56 bool was_cr = s->was_cr;
57
58 /* A NULL buffer signals end-of-file; finalize any pending state */
59 if (!buf) /* This signals EOF */
60 {
61 if (was_cr)
62 info->binary = true;
63 if (linelen > info->linemax)
64 info->linemax = linelen;
65
66 return;
67 }
68
69 /* Scan each byte in the buffer and classify it for MIME encoding decisions */
70 for (; buflen; buf++, buflen--)
71 {
72 char ch = *buf;
73
74 /* A CR not followed by LF indicates binary content */
75 if (was_cr)
76 {
77 was_cr = false;
78 if (ch == '\n')
79 {
80 /* CR+LF: complete line ending; record line statistics */
81 if (whitespace)
82 info->space = true;
83 if (dot)
84 info->dot = true;
85 if (linelen > info->linemax)
86 info->linemax = linelen;
87 whitespace = 0;
88 dot = false;
89 linelen = 0;
90 continue;
91 }
92
93 info->binary = true;
94 }
95
96 linelen++;
97 if (ch == '\n')
98 {
99 /* Bare LF line ending */
100 info->crlf++;
101 if (whitespace)
102 info->space = true;
103 if (dot)
104 info->dot = true;
105 if (linelen > info->linemax)
106 info->linemax = linelen;
107 whitespace = 0;
108 linelen = 0;
109 dot = false;
110 }
111 else if (ch == '\r')
112 {
113 info->crlf++;
114 info->cr = true;
115 was_cr = true;
116 continue;
117 }
118 else if (ch & 0x80)
119 {
120 /* High-bit character: needs 8-bit or Base64 encoding */
121 info->hibin++;
122 }
123 else if ((ch == '\t') || (ch == '\f'))
124 {
125 info->ascii++;
126 whitespace++;
127 }
128 else if (ch == 0)
129 {
130 /* NUL byte: forces binary encoding */
131 info->nulbin++;
132 info->lobin++;
133 }
134 else if ((ch < 32) || (ch == 127))
135 {
136 info->lobin++;
137 }
138 else
139 {
140 /* Detect "From " at the start of a line (mbox from-quoting) */
141 if (linelen == 1)
142 {
143 if ((ch == 'F') || (ch == 'f'))
144 from = true;
145 else
146 from = false;
147 if (ch == '.')
148 dot = true;
149 else
150 dot = false;
151 }
152 else if (from)
153 {
154 /* Check chars 2-4 for "rom" to complete "From" detection */
155 if ((linelen == 2) && (ch != 'r'))
156 {
157 from = false;
158 }
159 else if ((linelen == 3) && (ch != 'o'))
160 {
161 from = false;
162 }
163 else if (linelen == 4)
164 {
165 if (ch == 'm')
166 info->from = true;
167 from = false;
168 }
169 }
170 if (ch == ' ')
171 whitespace++;
172 info->ascii++;
173 }
174
175 if (linelen > 1)
176 dot = false;
177 if ((ch != ' ') && (ch != '\t'))
178 whitespace = 0;
179 }
180
181 s->from = from;
182 s->whitespace = whitespace;
183 s->dot = dot;
184 s->linelen = linelen;
185 s->was_cr = was_cr;
186}
bool was_cr
Was the last character CR?
Definition content.h:61
int whitespace
Number of trailing whitespaces.
Definition content.h:58
bool from
Is the current line a prefix of "From "?
Definition content.h:57
int linelen
Length of the current line.
Definition content.h:60
bool dot
Was the last character a dot?
Definition content.h:59
long crlf
\r and \n characters
Definition content.h:39
bool cr
Has CR, even when in a CRLF pair.
Definition content.h:46
bool space
Whitespace at the end of lines?
Definition content.h:42
long ascii
Number of ascii chars.
Definition content.h:40
bool binary
Long lines, or CR not in CRLF pair.
Definition content.h:43
bool from
Has a line beginning with "From "?
Definition content.h:44
long nulbin
Null characters (0x0)
Definition content.h:38
long linemax
Length of the longest line in the file.
Definition content.h:41
long lobin
Unprintable 7-bit chars (eg., control chars)
Definition content.h:37
bool dot
Has a line consisting of a single dot?
Definition content.h:45
+ Here is the caller graph for this function: