Conversion between different character encodings. More...

#include <stdio.h>

Include dependency graph for lib.h:

This graph shows which files directly or indirectly include this file:

Functions
size_t	mutt_convert_file_from_to (FILE *fp, const struct Slist *fromcodes, const struct Slist *tocodes, char **fromcode, char **tocode, struct Content *info)
	Convert a file between encodings.

size_t	mutt_convert_file_to (FILE *fp, const char *fromcode, struct Slist const *const tocodes, int *tocode, struct Content *info)
	Change the encoding of a file.

struct Content *	mutt_get_content_info (const char *fname, struct Body *b, struct ConfigSubset *sub)
	Analyze file to determine MIME encoding to use.

void	mutt_update_content_info (struct Content *info, struct ContentState *s, char *buf, size_t buflen)
	Cache some info about an email.

Detailed Description

Conversion between different character encodings.

Authors

Michal Siedlaczek
Richard Russon

Copyright: This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file lib.h.

Function Documentation

◆ mutt_convert_file_from_to()

size_t mutt_convert_file_from_to	(	FILE *	fp,
		const struct Slist *	fromcodes,
		const struct Slist *	tocodes,
		char **	fromcode,
		char **	tocode,
		struct Content *	info )

Convert a file between encodings.

Parameters

[in]	fp	File to read from
[in]	fromcodes	Charsets to try converting FROM
[in]	tocodes	Charsets to try converting TO
[out]	fromcode	From charset selected
[out]	tocode	To charset selected
[out]	info	Info about the file

Return values

num	Characters converted
ICONV_ILLEGAL_SEQ	Error (as a size_t)

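Find the first of the fromcodes that gives a valid conversion and the best charset conversion of the file into one of the tocodes. If successful, set *fromcode to the matching entry of fromcodes (a pointer into that list, not a copy, so the caller must not free it and must copy it if it has to outlive the list), set *tocode to a dynamically allocated copy of the chosen target charset (the caller must free it), set Content *info, and return the number of characters converted inexactly. If no conversion was possible, return ICONV_ILLEGAL_SEQ (-1 as a size_t); *fromcode and *tocode are then left untouched.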

Definition at line 215 of file convert.c.

{
  /* Take private copies of every candidate target charset, so that the
   * winning one can be handed to the caller without touching 'tocodes'. */
  char **targets = MUTT_MEM_CALLOC(tocodes->count, char *);
  struct ListNode *node = NULL;
  int chosen = 0;
  STAILQ_FOREACH(node, &tocodes->head, entries)
  {
    targets[chosen] = mutt_str_dup(node->data);
    chosen++;
  }

  /* Walk the source charsets in order; the first one that converts wins.
   * With an empty 'fromcodes' list the result stays ICONV_ILLEGAL_SEQ. */
  size_t rc = ICONV_ILLEGAL_SEQ;
  chosen = 0;
  node = NULL;
  STAILQ_FOREACH(node, &fromcodes->head, entries)
  {
    rc = mutt_convert_file_to(fp, node->data, tocodes, &chosen, info);
    if (rc == ICONV_ILLEGAL_SEQ)
      continue;

    /* The source charset is a borrowed pointer into 'fromcodes' */
    *fromcode = node->data;
    /* The target charset copy now belongs to the caller; clear our slot so
     * the cleanup loop below doesn't free it */
    *tocode = targets[chosen];
    targets[chosen] = NULL;
    break;
  }

  /* Release the copies that weren't handed out, then the array itself */
  for (int i = 0; i < tocodes->count; i++)
    FREE(&targets[i]);
  FREE(&targets);

  return rc;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_convert_file_to()

size_t mutt_convert_file_to	(	FILE *	fp,
		const char *	fromcode,
		struct Slist const *const	tocodes,
		int *	tocode,
		struct Content *	info )

Change the encoding of a file.

Parameters

[in]	fp	File to convert
[in]	fromcode	Original encoding
[in]	tocodes	List of target encodings
[out]	tocode	Chosen encoding
[out]	info	Encoding information

Return values

ICONV_ILLEGAL_SEQ	Error, no conversion was possible (-1 as a size_t)
>=0	Success, number of characters converted inexactly (0 means an exact conversion)

Find the best charset conversion of the file from fromcode into one of the tocodes. If successful, set *tocode and Content *info and return the number of characters converted inexactly.

We convert via UTF-8 in order to avoid the condition -1(EINVAL), which would otherwise prevent us from knowing the number of inexact conversions. Where the candidate target charset is UTF-8 we avoid doing the second conversion because iconv_open("UTF-8", "UTF-8") fails with some libraries.

We assume that the output from iconv is never more than 4 times as long as the input for any pair of charsets we might be interested in.

Definition at line 64 of file convert.c.

{
  /* Overview: the file is read in chunks, each chunk is converted from
   * 'fromcode' to UTF-8, and the UTF-8 is then converted to every candidate
   * in 'tocodes'.  Each candidate accumulates a score (the sum of iconv()'s
   * return values); the candidate with the lowest score is selected. */
  char bufi[256] = { 0 };              /* raw input read from the file */
  char bufu[512] = { 0 };              /* intermediate UTF-8 */
  char bufo[4 * sizeof(bufi)] = { 0 }; /* output in a candidate charset */
  size_t rc;
 
  /* Handle for the first stage: fromcode -> UTF-8 */
  const iconv_t cd1 = mutt_ch_iconv_open("utf-8", fromcode, MUTT_ICONV_NO_FLAGS);
  if (!iconv_t_valid(cd1))
    return -1; /* converted to size_t; presumably equal to ICONV_ILLEGAL_SEQ -- confirm */
 
  /* Parallel arrays, one slot per candidate target charset */
  int ncodes = tocodes->count;
  iconv_t *cd = MUTT_MEM_CALLOC(ncodes, iconv_t);
  size_t *score = MUTT_MEM_CALLOC(ncodes, size_t);
  struct ContentState *states = MUTT_MEM_CALLOC(ncodes, struct ContentState);
  struct Content *infos = MUTT_MEM_CALLOC(ncodes, struct Content);
 
  /* Open the second-stage handles: UTF-8 -> candidate.
   * A candidate whose open fails is left with an invalid handle and a zero
   * score; both loops below skip that combination. */
  struct ListNode *np = NULL;
  int ni = 0;
  STAILQ_FOREACH(np, &tocodes->head, entries)
  {
    if (!mutt_istr_equal(np->data, "utf-8"))
    {
      cd[ni] = mutt_ch_iconv_open(np->data, "utf-8", MUTT_ICONV_NO_FLAGS);
    }
    else
    {
      /* Special case for conversion to UTF-8:
       * no second conversion is needed, so the pair (invalid handle,
       * score == ICONV_ILLEGAL_SEQ) is used as the marker for this slot */
      cd[ni] = ICONV_T_INVALID;
      score[ni] = ICONV_ILLEGAL_SEQ;
    }
    ni += 1;
  }
 
  rewind(fp);
  size_t ibl = 0; /* number of unconverted bytes currently held in bufi */
  while (true)
  {
    /* Try to fill input buffer (appending after any bytes left over from
     * the previous iteration) */
    size_t n = fread(bufi + ibl, 1, sizeof(bufi) - ibl, fp);
    ibl += n;
 
    /* Convert to UTF-8.
     * When there is no input left (ibl == 0) a NULL input pointer is passed
     * instead, which per iconv(3) flushes/resets the conversion state. */
    const char *ib = bufi;
    char *ob = bufu;
    size_t obl = sizeof(bufu);
    n = iconv(cd1, (ICONV_CONST char **) ((ibl != 0) ? &ib : 0), &ibl, &ob, &obl);
    /* EINVAL (incomplete sequence at the end of the input) and E2BIG (output
     * buffer full) are tolerated as long as some input was consumed; any
     * other error, or no progress at all (ib == bufi), aborts the conversion */
    if ((n == ICONV_ILLEGAL_SEQ) && (((errno != EINVAL) && (errno != E2BIG)) || (ib == bufi)))
    {
      rc = ICONV_ILLEGAL_SEQ;
      break;
    }
    const size_t ubl1 = ob - bufu; /* bytes of UTF-8 produced this round */
 
    /* Convert from UTF-8 */
    for (int i = 0; i < ncodes; i++)
    {
      /* Only candidates with a usable handle that haven't failed yet */
      if (iconv_t_valid(cd[i]) && (score[i] != ICONV_ILLEGAL_SEQ))
      {
        const char *ub = bufu;
        size_t ubl = ubl1;
        ob = bufo;
        obl = sizeof(bufo);
        n = iconv(cd[i], (ICONV_CONST char **) ((ibl || ubl) ? &ub : 0), &ubl, &ob, &obl);
        if (n == ICONV_ILLEGAL_SEQ)
        {
          /* This candidate can't represent the data; drop it for good */
          score[i] = ICONV_ILLEGAL_SEQ;
        }
        else
        {
          /* Accumulate iconv()'s return value and analyse the converted output */
          score[i] += n;
          mutt_update_content_info(&infos[i], &states[i], bufo, ob - bufo);
        }
      }
      else if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ))
      {
        /* Special case for conversion to UTF-8:
         * analyse the intermediate UTF-8 directly */
        mutt_update_content_info(&infos[i], &states[i], bufu, ubl1);
      }
    }
 
    if (ibl)
    {
      /* Save unused input: move it to the start of the buffer so the next
       * fread() appends after it */
      memmove(bufi, ib, ibl);
    }
    else if (!ubl1 && (ib < bufi + sizeof(bufi)))
    {
      /* Nothing left over, nothing produced, and less than a full buffer was
       * consumed: the whole file has been processed */
      rc = 0;
      break;
    }
  }
 
  if (rc == 0)
  {
    /* Find best score (lowest wins) */
    rc = ICONV_ILLEGAL_SEQ;
    for (int i = 0; i < ncodes; i++)
    {
      if (!iconv_t_valid(cd[i]) && (score[i] == ICONV_ILLEGAL_SEQ))
      {
        /* Special case for conversion to UTF-8:
         * selected immediately with a result of 0 */
        *tocode = i;
        rc = 0;
        break;
      }
      else if (!iconv_t_valid(cd[i]) || (score[i] == ICONV_ILLEGAL_SEQ))
      {
        /* Handle couldn't be opened, or the candidate failed mid-file */
        continue;
      }
      else if ((rc == ICONV_ILLEGAL_SEQ) || (score[i] < rc))
      {
        *tocode = i;
        rc = score[i];
        if (rc == 0)
          break; /* a zero score can't be beaten */
      }
    }
    if (rc != ICONV_ILLEGAL_SEQ)
    {
      /* Hand the winner's statistics to the caller and finalise them */
      memcpy(info, &infos[*tocode], sizeof(struct Content));
      mutt_update_content_info(info, &states[*tocode], 0, 0); /* EOF */
    }
  }
 
  /* NOTE(review): neither cd1 nor the cd[] handles are closed here --
   * presumably mutt_ch_iconv_open() retains ownership of them; confirm */
  FREE(&cd);
  FREE(&infos);
  FREE(&score);
  FREE(&states);
 
  return rc;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_get_content_info()

struct Content * mutt_get_content_info	(	const char *	fname,
		struct Body *	b,
		struct ConfigSubset *	sub )

Analyze file to determine MIME encoding to use.

Parameters

fname	File to examine
b	Body to update
sub	Config Subset

Return values

ptr	Newly allocated Content

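When b is a text body with neither noconv nor force_charset set, the body's "charset" parameter is also updated. If a charset conversion succeeds, b->charset is set to the source charset, and the "charset" parameter is set to the chosen target charset unless the body already had one. Otherwise the "charset" parameter is set to "us-ascii", the configured charset, or "unknown-8bit", depending on whether the file contains 8-bit characters.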

Definition at line 188 of file content_info.c.

{
  /* No explicit filename: fall back to the one stored in the Body */
  if (!fname && b)
    fname = b->filename;
  if (!fname)
    return NULL;

  /* Only regular files can be analysed */
  struct stat st = { 0 };
  if (stat(fname, &st) == -1)
  {
    mutt_error(_("Can't stat %s: %s"), fname, strerror(errno));
    return NULL;
  }
  if (!S_ISREG(st.st_mode))
  {
    mutt_error(_("%s isn't a regular file"), fname);
    return NULL;
  }

  FILE *fp = mutt_file_fopen(fname, "r");
  if (!fp)
  {
    mutt_debug(LL_DEBUG1, "%s: %s (errno %d)\n", fname, strerror(errno), errno);
    return NULL;
  }

  struct Content *info = MUTT_MEM_CALLOC(1, struct Content);
  const char *const c_charset = cc_charset();

  /* Charset handling only applies to text bodies that allow conversion and
   * don't have a forced charset */
  const bool wants_charset = b && (b->type == TYPE_TEXT) && !b->noconv && !b->force_charset;

  if (wants_charset)
  {
    const struct Slist *const c_attach_charset = cs_subset_slist(sub, "attach_charset");
    const struct Slist *const c_send_charset = cs_subset_slist(sub, "send_charset");
    struct Slist *own_charsets = slist_parse(c_charset, D_SLIST_SEP_COLON);

    /* Source charsets: "attach_charset" when use_disp is set and the option
     * has a value, otherwise the list parsed from the configured charset */
    const struct Slist *from_list = own_charsets;
    if (b->use_disp && c_attach_charset)
      from_list = c_attach_charset;

    /* Target charsets: the body's own "charset" parameter takes precedence
     * over "send_charset" */
    struct Slist *param_charsets = slist_parse(mutt_param_get(&b->parameter, "charset"),
                                               D_SLIST_SEP_COLON);
    const struct Slist *to_list = param_charsets ? param_charsets : c_send_charset;

    char *fromcode = NULL;
    char *tocode = NULL;
    bool converted = false;
    if (c_charset && to_list)
    {
      converted = (mutt_convert_file_from_to(fp, from_list, to_list, &fromcode,
                                             &tocode, info) != ICONV_ILLEGAL_SEQ);
    }

    if (converted)
    {
      if (!param_charsets)
      {
        /* The body had no charset parameter: record the chosen target */
        char chsbuf[256] = { 0 };
        mutt_ch_canonical_charset(chsbuf, sizeof(chsbuf), tocode);
        mutt_param_set(&b->parameter, "charset", chsbuf);
      }
      /* fromcode is copied here, before the lists below are freed */
      FREE(&b->charset);
      b->charset = mutt_str_dup(fromcode);
      FREE(&tocode);
      mutt_file_fclose(&fp);
    }

    slist_free(&own_charsets);
    slist_free(&param_charsets);

    if (converted)
      return info;
  }

  /* No conversion was done: analyse the raw bytes of the file instead */
  struct ContentState cstate = { 0 };
  char chunk[100] = { 0 };
  size_t nread;

  rewind(fp);
  while ((nread = fread(chunk, 1, sizeof(chunk), fp)) != 0)
    mutt_update_content_info(info, &cstate, chunk, nread);
  mutt_update_content_info(info, &cstate, NULL, 0); /* signal EOF */

  mutt_file_fclose(&fp);

  if (wants_charset)
  {
    /* No 8-bit characters => us-ascii; otherwise the configured charset
     * if it is set and isn't us-ascii; otherwise unknown */
    const char *label = "unknown-8bit";
    if (!info->hibin)
      label = "us-ascii";
    else if (c_charset && !mutt_ch_is_us_ascii(c_charset))
      label = c_charset;

    mutt_param_set(&b->parameter, "charset", label);
  }

  return info;
}

Here is the call graph for this function:

Here is the caller graph for this function:

◆ mutt_update_content_info()

void mutt_update_content_info	(	struct Content *	info,
		struct ContentState *	s,
		char *	buf,
		size_t	buflen )

Cache some info about an email.

Parameters

info	Info about an Attachment
s	Scanner state carried over between successive calls
buf	Data to analyse, or NULL to signal the end of the input
buflen	Number of bytes in buf

Definition at line 49 of file content_info.c.

{
  /* Work on local copies of the running state; they are stored back into
   * 's' once the whole buffer has been scanned */
  bool maybe_from = s->from;      /* line so far is a prefix of "From" */
  int trailing_ws = s->whitespace; /* run of whitespace at the current position */
  bool lone_dot = s->dot;         /* line so far consists of a single '.' */
  int cur_len = s->linelen;       /* length of the current line */
  bool pending_cr = s->was_cr;    /* previous character was a CR */

  if (!buf)
  {
    /* A NULL buffer signals EOF: account for the unfinished last line.
     * The state in 's' is deliberately left untouched. */
    if (pending_cr)
      info->binary = true;
    if (cur_len > info->linemax)
      info->linemax = cur_len;
    return;
  }

  for (size_t i = 0; i < buflen; i++)
  {
    const char ch = buf[i];
    bool line_done = false;

    if (pending_cr)
    {
      pending_cr = false;
      if (ch == '\n')
        line_done = true; /* LF completing a CRLF; not counted in the line length */
      else
        info->binary = true; /* CR not followed by LF */
    }

    if (!line_done)
    {
      cur_len++;
      if (ch == '\n')
      {
        /* Bare LF: counted both as a line ending and in the line length */
        info->crlf++;
        line_done = true;
      }
    }

    if (line_done)
    {
      /* End of line: record what was seen, then reset the per-line state */
      if (trailing_ws)
        info->space = true;
      if (lone_dot)
        info->dot = true;
      if (cur_len > info->linemax)
        info->linemax = cur_len;
      trailing_ws = 0;
      lone_dot = false;
      cur_len = 0;
      continue;
    }

    if (ch == '\r')
    {
      /* Decide what this CR means once the next character is known */
      info->crlf++;
      info->cr = true;
      pending_cr = true;
      continue;
    }

    if (ch & 0x80)
    {
      info->hibin++;
    }
    else if ((ch == '\t') || (ch == '\f'))
    {
      info->ascii++;
      trailing_ws++;
    }
    else if (ch == 0)
    {
      info->nulbin++;
      info->lobin++;
    }
    else if ((ch < 32) || (ch == 127))
    {
      info->lobin++;
    }
    else
    {
      if (cur_len == 1)
      {
        /* The first character decides whether the line can still become
         * "From..." or a lone "." */
        maybe_from = (ch == 'F') || (ch == 'f');
        lone_dot = (ch == '.');
      }
      else if (maybe_from)
      {
        /* Match the remaining letters of "From" by position */
        switch (cur_len)
        {
          case 2:
            if (ch != 'r')
              maybe_from = false;
            break;
          case 3:
            if (ch != 'o')
              maybe_from = false;
            break;
          case 4:
            if (ch == 'm')
              info->from = true;
            maybe_from = false;
            break;
          default:
            break;
        }
      }

      if (ch == ' ')
        trailing_ws++;
      info->ascii++;
    }

    /* A dot only counts when it is the sole character on the line */
    if (cur_len > 1)
      lone_dot = false;
    /* Anything other than a space or tab ends the whitespace run */
    if ((ch != ' ') && (ch != '\t'))
      trailing_ws = 0;
  }

  /* Save the state for the next call */
  s->from = maybe_from;
  s->whitespace = trailing_ws;
  s->dot = lone_dot;
  s->linelen = cur_len;
  s->was_cr = pending_cr;
}

Here is the caller graph for this function:

Functions

Detailed Description

Function Documentation

◆ mutt_convert_file_from_to()

◆ mutt_convert_file_to()

◆ mutt_get_content_info()

◆ mutt_update_content_info()