Leptonica  1.82.0
Image processing and image analysis suite
encoding.c
1 /*====================================================================*
2  - Copyright (C) 2001 Leptonica. All rights reserved.
3  - This software is distributed in the hope that it will be
4  - useful, but with NO WARRANTY OF ANY KIND.
5  - No author or distributor accepts responsibility to anyone for the
6  - consequences of using this software, or for whether it serves any
7  - particular purpose or works at all, unless he or she says so in
8  - writing. Everyone is granted permission to copy, modify and
9  - redistribute this source code, for commercial or non-commercial
10  - purposes, with the following restrictions: (1) the origin of this
11  - source code must not be misrepresented; (2) modified versions must
12  - be plainly marked as such; and (3) this notice may not be removed
13  - or altered from any source or modified source distribution.
14  *====================================================================*/
15 
16 /*
17  * encoding.c
18  *
19  * Base64
20  * char *encodeBase64()
21  * l_uint8 *decodeBase64()
22  * static l_int32 isBase64()
23  * static l_int32 *genReverseTab64()
24  * static void byteConvert3to4()
25  * static void byteConvert4to3()
26  *
27  * Ascii85
28  * char *encodeAscii85()
29  * l_uint8 *decodeAscii85()
30  * static l_int32 convertChunkToAscii85()
31  *
32  * char *encodeAscii85WithComp()
33  * l_uint8 *decodeAscii85WithComp()
34  *
35  * String reformatting for base 64 encoded data
36  * char *reformatPacked64()
37  *
38  * Base64 encoding is useful for encoding binary data in a restricted set of
39  * 64 printable ascii symbols, that includes the 62 alphanumerics and '+'
40  * and '/'. Notably it does not include quotes, so that base64 encoded
41  * strings can be used in situations where quotes are used for formatting.
42  * 64 symbols was chosen because it is the smallest number that can be used
43  * in 4-for-3 byte encoding of binary data:
44  * log2(64) / log2(256) = 0.75 = 3/4
45  *
46  * Ascii85 encoding is used in PostScript and some pdf files for
47  * representing binary data (for example, a compressed image) in printable
48  * ascii symbols. It has a dictionary of 85 symbols; 85 was chosen because
49  * it is the smallest number that can be used in 5-for-4 byte encoding
50  * of binary data (256 possible input values). This can be seen from
51  * the max information content in such a sequence:
52  * log2(84) / log2(256) = 0.799 < 4/5
53  * log2(85) / log2(256) = 0.801 > 4/5
54  */
55 
56 #ifdef HAVE_CONFIG_H
57 #include <config_auto.h>
58 #endif /* HAVE_CONFIG_H */
59 
60 #include <ctype.h>
61 #include <string.h>
62 #include "allheaders.h"
63 
64  /* Base64 encoding: output line length limit, and the 64-symbol alphabet in string representation; the position of a symbol in the string is the 6-bit value that it encodes */
65 static const l_int32 MAX_BASE64_LINE = 72; /* max number of base64 symbols written before a newline is inserted */
66 static const char *tablechar64 =
67  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
68  "abcdefghijklmnopqrstuvwxyz"
69  "0123456789+/";
70 
71 static l_int32 isBase64(char);
72 static l_int32 *genReverseTab64(void);
73 static void byteConvert3to4(l_uint8 *in3, l_uint8 *out4);
74 static void byteConvert4to3(l_uint8 *in4, l_uint8 *out3);
75 
76  /* Ascii85 encoding: output line length limit, and the powers of 85 (power85[i] == 85^i) used to peel off base-85 digits, most significant first */
77 static const l_int32 MAX_ASCII85_LINE = 64; /* max number of ascii85 symbols written before a newline is inserted */
78 static const l_uint32 power85[5] = {1,
79  85,
80  85 * 85,
81  85 * 85 * 85,
82  85 * 85 * 85 * 85};
83 
84 static l_int32 convertChunkToAscii85(const l_uint8 *inarray, size_t insize,
85  l_int32 *pindex, char *outbuf,
86  l_int32 *pnbout);
87 
88 /*-------------------------------------------------------------*
89  * Utility for encoding and decoding data with base64 *
90  *-------------------------------------------------------------*/
106 char *
107 encodeBase64(const l_uint8 *inarray,
108  l_int32 insize,
109  l_int32 *poutsize)
110 {
111 char *chara;
112 const l_uint8 *bytea;
113 l_uint8 array3[3], array4[4];
114 l_int32 outsize, i, j, index, linecount;
115 
116  PROCNAME("encodeBase64");
117 
118  if (!poutsize)
119  return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
120  *poutsize = 0;
121  if (!inarray)
122  return (char *)ERROR_PTR("inarray not defined", procName, NULL);
123  if (insize <= 0)
124  return (char *)ERROR_PTR("insize not > 0", procName, NULL);
125 
126  /* The output array is padded to a multiple of 4 bytes, not
127  * counting the newlines. We just need to allocate a large
128  * enough array, and add 4 bytes to make sure it is big enough. */
129  outsize = 4 * ((insize + 2) / 3); /* without newlines */
130  outsize += outsize / MAX_BASE64_LINE + 4; /* with the newlines */
131  if ((chara = (char *)LEPT_CALLOC(outsize, sizeof(char))) == NULL)
132  return (char *)ERROR_PTR("chara not made", procName, NULL);
133 
134  /* Read all the input data, and convert in sets of 3 input
135  * bytes --> 4 output bytes. */
136  i = index = linecount = 0;
137  bytea = inarray;
138  while (insize--) {
139  if (linecount == MAX_BASE64_LINE) {
140  chara[index++] = '\n';
141  linecount = 0;
142  }
143  array3[i++] = *bytea++;
144  if (i == 3) { /* convert 3 to 4 and save */
145  byteConvert3to4(array3, array4);
146  for (j = 0; j < 4; j++)
147  chara[index++] = tablechar64[array4[j]];
148  i = 0;
149  linecount += 4;
150  }
151  }
152 
153  /* Suppose 1 or 2 bytes has been read but not yet processed.
154  * If 1 byte has been read, this will generate 2 bytes of
155  * output, with 6 bits to the first byte and 2 bits to the second.
156  * We will add two bytes of '=' for padding.
157  * If 2 bytes has been read, this will generate 3 bytes of output,
158  * with 6 bits to the first 2 bytes and 4 bits to the third, and
159  * we add a fourth padding byte ('='). */
160  if (i > 0) { /* left-over 1 or 2 input bytes */
161  for (j = i; j < 3; j++)
162  array3[j] = '\0'; /* zero the remaining input bytes */
163  byteConvert3to4(array3, array4);
164  for (j = 0; j <= i; j++)
165  chara[index++] = tablechar64[array4[j]];
166  for (j = i + 1; j < 4; j++)
167  chara[index++] = '=';
168  }
169  *poutsize = index;
170 
171  return chara;
172 }
173 
174 
194 l_uint8 *
195 decodeBase64(const char *inarray,
196  l_int32 insize,
197  l_int32 *poutsize)
198 {
199 char inchar;
200 l_uint8 *bytea;
201 l_uint8 array3[3], array4[4];
202 l_int32 *rtable64;
203 l_int32 i, j, outsize, in_index, out_index;
204 
205  PROCNAME("decodeBase64");
206 
207  if (!poutsize)
208  return (l_uint8 *)ERROR_PTR("&outsize not defined", procName, NULL);
209  *poutsize = 0;
210  if (!inarray)
211  return (l_uint8 *)ERROR_PTR("inarray not defined", procName, NULL);
212  if (insize <= 0)
213  return (l_uint8 *)ERROR_PTR("insize not > 0", procName, NULL);
214 
215  /* Validate the input data */
216  for (i = 0; i < insize; i++) {
217  inchar = inarray[i];
218  if (inchar == '\n') continue;
219  if (isBase64(inchar) == 0 && inchar != '=')
220  return (l_uint8 *)ERROR_PTR("invalid char in inarray",
221  procName, NULL);
222  }
223 
224  /* The input array typically is made with a newline every
225  * MAX_BASE64_LINE input bytes. However, as a printed string, the
226  * newlines would be stripped. So when we allocate the output
227  * array, assume the input array is all data, but strip
228  * out the newlines during decoding. This guarantees that
229  * the allocated array is large enough. */
230  outsize = 3 * ((insize + 3) / 4) + 4;
231  if ((bytea = (l_uint8 *)LEPT_CALLOC(outsize, sizeof(l_uint8))) == NULL)
232  return (l_uint8 *)ERROR_PTR("bytea not made", procName, NULL);
233 
234  /* The number of encoded input data bytes is always a multiple of 4.
235  * Read all the data, until you reach either the end or
236  * the first pad character '='. The data is processed in
237  * units of 4 input bytes, generating 3 output decoded bytes
238  * of binary data. Newlines are ignored. If there are no
239  * pad bytes, i == 0 at the end of this section. */
240  rtable64 = genReverseTab64();
241  i = in_index = out_index = 0;
242  for (in_index = 0; in_index < insize; in_index++) {
243  inchar = inarray[in_index];
244  if (inchar == '\n') continue;
245  if (inchar == '=') break;
246  array4[i++] = rtable64[(unsigned char)inchar];
247  if (i < 4) {
248  continue;
249  } else { /* i == 4; convert 4 to 3 and save */
250  byteConvert4to3(array4, array3);
251  for (j = 0; j < 3; j++)
252  bytea[out_index++] = array3[j];
253  i = 0;
254  }
255  }
256 
257  /* If i > 0, we ran into pad bytes ('='). If i == 2, there are
258  * two input pad bytes and one output data byte. If i == 3,
259  * there is one input pad byte and two output data bytes. */
260  if (i > 0) {
261  for (j = i; j < 4; j++)
262  array4[j] = '\0'; /* zero the remaining input bytes */
263  byteConvert4to3(array4, array3);
264  for (j = 0; j < i - 1; j++)
265  bytea[out_index++] = array3[j];
266  }
267  *poutsize = out_index;
268 
269  LEPT_FREE(rtable64);
270  return bytea;
271 }
272 
273 
277 static l_int32
278 isBase64(char c)
279 {
280  return (isalnum(((int)c)) || ((c) == '+') || ((c) == '/')) ? 1 : 0;
281 }
282 
286 static l_int32 *
287 genReverseTab64()
288 {
289 l_int32 i;
290 l_int32 *rtable64;
291 
292  rtable64 = (l_int32 *)LEPT_CALLOC(128, sizeof(l_int32));
293  for (i = 0; i < 64; i++) {
294  rtable64[(unsigned char)tablechar64[i]] = i;
295  }
296  return rtable64;
297 }
298 
302 static void
303 byteConvert3to4(l_uint8 *in3,
304  l_uint8 *out4)
305 {
306  out4[0] = in3[0] >> 2;
307  out4[1] = ((in3[0] & 0x03) << 4) | (in3[1] >> 4);
308  out4[2] = ((in3[1] & 0x0f) << 2) | (in3[2] >> 6);
309  out4[3] = in3[2] & 0x3f;
310  return;
311 }
312 
316 static void
317 byteConvert4to3(l_uint8 *in4,
318  l_uint8 *out3)
319 {
320  out3[0] = (in4[0] << 2) | (in4[1] >> 4);
321  out3[1] = ((in4[1] & 0x0f) << 4) | (in4[2] >> 2);
322  out3[2] = ((in4[2] & 0x03) << 6) | in4[3];
323  return;
324 }
325 
326 
327 /*-------------------------------------------------------------*
328  * Utility for encoding and decoding data with ascii85 *
329  *-------------------------------------------------------------*/
345 char *
346 encodeAscii85(const l_uint8 *inarray,
347  size_t insize,
348  size_t *poutsize)
349 {
350 char *chara;
351 char outbuf[8];
352 l_int32 maxsize, i, index, linecount, nbout, eof;
353 size_t outindex;
354 
355  PROCNAME("encodeAscii85");
356 
357  if (!poutsize)
358  return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
359  *poutsize = 0;
360  if (!inarray)
361  return (char *)ERROR_PTR("inarray not defined", procName, NULL);
362  if (insize <= 0)
363  return (char *)ERROR_PTR("insize not > 0", procName, NULL);
364 
365  /* Accumulate results in char array */
366  maxsize = (l_int32)(80. + (insize * 5. / 4.) *
367  (1. + 2. / MAX_ASCII85_LINE));
368  if ((chara = (char *)LEPT_CALLOC(maxsize, sizeof(char))) == NULL)
369  return (char *)ERROR_PTR("chara not made", procName, NULL);
370 
371  linecount = 0;
372  index = 0;
373  outindex = 0;
374  while (1) {
375  eof = convertChunkToAscii85(inarray, insize, &index, outbuf, &nbout);
376  for (i = 0; i < nbout; i++) {
377  chara[outindex++] = outbuf[i];
378  linecount++;
379  if (linecount >= MAX_ASCII85_LINE) {
380  chara[outindex++] = '\n';
381  linecount = 0;
382  }
383  }
384  if (eof == TRUE) {
385  if (linecount != 0)
386  chara[outindex++] = '\n';
387  chara[outindex++] = '~';
388  chara[outindex++] = '>';
389  chara[outindex++] = '\n';
390  break;
391  }
392  }
393 
394  *poutsize = outindex;
395  return chara;
396 }
397 
398 
415 static l_int32
416 convertChunkToAscii85(const l_uint8 *inarray,
417  size_t insize,
418  l_int32 *pindex,
419  char *outbuf,
420  l_int32 *pnbout)
421 {
422 l_uint8 inbyte;
423 l_uint32 inword, val;
424 l_int32 eof, index, nread, nbout, i;
425 
426  eof = FALSE;
427  index = *pindex;
428  nread = L_MIN(4, (insize - index));
429  if (insize == index + nread)
430  eof = TRUE;
431  *pindex += nread; /* save new index */
432 
433  /* Read input data and save in l_uint32 */
434  inword = 0;
435  for (i = 0; i < nread; i++) {
436  inbyte = inarray[index + i];
437  inword += (l_uint32)inbyte << (8 * (3 - i));
438  }
439 
440 #if 0
441  lept_stderr("index = %d, nread = %d\n", index, nread);
442  lept_stderr("inword = %x\n", inword);
443  lept_stderr("eof = %d\n", eof);
444 #endif
445 
446  /* Special case: output 1 byte only */
447  if (inword == 0) {
448  outbuf[0] = 'z';
449  nbout = 1;
450  } else { /* output nread + 1 bytes */
451  for (i = 4; i >= 4 - nread; i--) {
452  val = inword / power85[i];
453  outbuf[4 - i] = (l_uint8)(val + '!');
454  inword -= val * power85[i];
455  }
456  nbout = nread + 1;
457  }
458  *pnbout = nbout;
459 
460  return eof;
461 }
462 
463 
480 l_uint8 *
481 decodeAscii85(const char *inarray,
482  size_t insize,
483  size_t *poutsize)
484 {
485 char inc;
486 const char *pin;
487 l_uint8 val;
488 l_uint8 *outa;
489 l_int32 maxsize, ocount, bytecount, index;
490 l_uint32 oword;
491 
492  PROCNAME("decodeAscii85");
493 
494  if (!poutsize)
495  return (l_uint8 *)ERROR_PTR("&outsize not defined", procName, NULL);
496  *poutsize = 0;
497  if (!inarray)
498  return (l_uint8 *)ERROR_PTR("inarray not defined", procName, NULL);
499  if (insize <= 0)
500  return (l_uint8 *)ERROR_PTR("insize not > 0", procName, NULL);
501 
502  /* Accumulate results in outa */
503  maxsize = (l_int32)(80. + (insize * 4. / 5.)); /* plenty big */
504  if ((outa = (l_uint8 *)LEPT_CALLOC(maxsize, sizeof(l_uint8))) == NULL)
505  return (l_uint8 *)ERROR_PTR("outa not made", procName, NULL);
506 
507  pin = inarray;
508  ocount = 0; /* byte index into outa */
509  oword = 0;
510  for (index = 0, bytecount = 0; index < insize; index++, pin++) {
511  inc = *pin;
512 
513  if (inc == ' ' || inc == '\t' || inc == '\n' ||
514  inc == '\f' || inc == '\r' || inc == '\v') /* ignore white space */
515  continue;
516 
517  val = inc - '!';
518  if (val < 85) {
519  oword = oword * 85 + val;
520  if (bytecount < 4) {
521  bytecount++;
522  } else { /* we have all 5 input chars for the oword */
523  outa[ocount] = (oword >> 24) & 0xff;
524  outa[ocount + 1] = (oword >> 16) & 0xff;
525  outa[ocount + 2] = (oword >> 8) & 0xff;
526  outa[ocount + 3] = oword & 0xff;
527  ocount += 4;
528  bytecount = 0;
529  oword = 0;
530  }
531  } else if (inc == 'z' && bytecount == 0) {
532  outa[ocount] = 0;
533  outa[ocount + 1] = 0;
534  outa[ocount + 2] = 0;
535  outa[ocount + 3] = 0;
536  ocount += 4;
537  } else if (inc == '~') { /* end of data */
538  L_INFO(" %d extra bytes output\n", procName, bytecount - 1);
539  switch (bytecount) {
540  case 0: /* normal eof */
541  case 1: /* error */
542  break;
543  case 2: /* 1 extra byte */
544  oword = oword * power85[3] + 0xffffff;
545  outa[ocount] = (oword >> 24) & 0xff;
546  break;
547  case 3: /* 2 extra bytes */
548  oword = oword * power85[2] + 0xffff;
549  outa[ocount] = (oword >> 24) & 0xff;
550  outa[ocount + 1] = (oword >> 16) & 0xff;
551  break;
552  case 4: /* 3 extra bytes */
553  oword = oword * 85 + 0xff;
554  outa[ocount] = (oword >> 24) & 0xff;
555  outa[ocount + 1] = (oword >> 16) & 0xff;
556  outa[ocount + 2] = (oword >> 8) & 0xff;
557  break;
558  }
559  if (bytecount > 1)
560  ocount += (bytecount - 1);
561  break;
562  }
563  }
564  *poutsize = ocount;
565 
566  return outa;
567 }
568 
569 
585 char *
586 encodeAscii85WithComp(const l_uint8 *indata,
587  size_t insize,
588  size_t *poutsize)
589 {
590 char *outstr;
591 size_t size1;
592 l_uint8 *data1;
593 
594  PROCNAME("encodeAscii85WithComp");
595 
596  if (!poutsize)
597  return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
598  *poutsize = 0;
599  if (!indata)
600  return (char *)ERROR_PTR("indata not defined", procName, NULL);
601 
602  if ((data1 = zlibCompress(indata, insize, &size1)) == NULL)
603  return (char *)ERROR_PTR("data1 not made", procName, NULL);
604  outstr = encodeAscii85(data1, size1, poutsize);
605  LEPT_FREE(data1);
606  return outstr;
607 }
608 
609 
626 l_uint8 *
627 decodeAscii85WithComp(const char *instr,
628  size_t insize,
629  size_t *poutsize)
630 {
631 size_t size1;
632 l_uint8 *data1, *outdata;
633 
634  PROCNAME("decodeAscii85WithComp");
635 
636  if (!poutsize)
637  return (l_uint8 *)ERROR_PTR("&outsize not defined", procName, NULL);
638  *poutsize = 0;
639  if (!instr)
640  return (l_uint8 *)ERROR_PTR("instr not defined", procName, NULL);
641 
642  if (insize == 0) insize = strlen(instr);
643  if ((data1 = decodeAscii85(instr, insize, &size1)) == NULL)
644  return (l_uint8 *)ERROR_PTR("data1 not made", procName, NULL);
645  outdata = zlibUncompress(data1, size1, poutsize);
646  LEPT_FREE(data1);
647  return outdata;
648 }
649 
650 
651 /*-------------------------------------------------------------*
652  * String reformatting for base 64 encoded data *
653  *-------------------------------------------------------------*/
675 char *
676 reformatPacked64(const char *inarray,
677  l_int32 insize,
678  l_int32 leadspace,
679  l_int32 linechars,
680  l_int32 addquotes,
681  l_int32 *poutsize)
682 {
683 char *flata, *outa;
684 l_int32 i, j, flatindex, flatsize, outindex, nlines, linewithpad, linecount;
685 
686  PROCNAME("reformatPacked64");
687 
688  if (!poutsize)
689  return (char *)ERROR_PTR("&outsize not defined", procName, NULL);
690  *poutsize = 0;
691  if (!inarray)
692  return (char *)ERROR_PTR("inarray not defined", procName, NULL);
693  if (insize <= 0)
694  return (char *)ERROR_PTR("insize not > 0", procName, NULL);
695  if (leadspace < 0)
696  return (char *)ERROR_PTR("leadspace must be >= 0", procName, NULL);
697  if (linechars % 4)
698  return (char *)ERROR_PTR("linechars % 4 must be 0", procName, NULL);
699 
700  /* Remove all white space */
701  if ((flata = (char *)LEPT_CALLOC(insize, sizeof(char))) == NULL)
702  return (char *)ERROR_PTR("flata not made", procName, NULL);
703  for (i = 0, flatindex = 0; i < insize; i++) {
704  if (isBase64(inarray[i]) || inarray[i] == '=')
705  flata[flatindex++] = inarray[i];
706  }
707 
708  /* Generate output string */
709  flatsize = flatindex;
710  nlines = (flatsize + linechars - 1) / linechars;
711  linewithpad = leadspace + linechars + 1; /* including newline */
712  if (addquotes) linewithpad += 2;
713  if ((outa = (char *)LEPT_CALLOC((size_t)nlines * linewithpad,
714  sizeof(char))) == NULL) {
715  LEPT_FREE(flata);
716  return (char *)ERROR_PTR("outa not made", procName, NULL);
717  }
718  for (j = 0, outindex = 0; j < leadspace; j++)
719  outa[outindex++] = ' ';
720  if (addquotes) outa[outindex++] = '"';
721  for (i = 0, linecount = 0; i < flatsize; i++) {
722  if (linecount == linechars) {
723  if (addquotes) outa[outindex++] = '"';
724  outa[outindex++] = '\n';
725  for (j = 0; j < leadspace; j++)
726  outa[outindex++] = ' ';
727  if (addquotes) outa[outindex++] = '"';
728  linecount = 0;
729  }
730  outa[outindex++] = flata[i];
731  linecount++;
732  }
733  if (addquotes) outa[outindex++] = '"';
734  *poutsize = outindex;
735 
736  LEPT_FREE(flata);
737  return outa;
738 }
void lept_stderr(const char *fmt,...)
lept_stderr()
Definition: utils1.c:306
l_uint8 * zlibUncompress(const l_uint8 *datain, size_t nin, size_t *pnout)
zlibUncompress()
Definition: zlibmem.c:196
l_uint8 * zlibCompress(const l_uint8 *datain, size_t nin, size_t *pnout)
zlibCompress()
Definition: zlibmem.c:92