]> www.wagner.pp.ru Git - oss/catdoc.git/blob - src/rtfread.c
Similar fix in reader.c
[oss/catdoc.git] / src / rtfread.c
1 /*****************************************************************/
2 /* Reading routines for rtf files                                */
3 /*                                                               */
4 /* This file is part of catdoc project                           */
5 /* (c) Victor Wagner 2003, (c) Alex Ott 2003                 */
6 /*****************************************************************/
7 #ifdef HAVE_CONFIG_H
8 #include <config.h>
9 #endif
10
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <ctype.h>
14 #include <string.h>
15 #include "catdoc.h"
16
17 /********************************************************
18  * Datatypes declaration
19  * 
20  */
/**
 * Kinds of RTF control words and control symbols the parser distinguishes.
 * The enumerator order is kept as it was, so numeric values are unchanged.
 */
typedef enum {
        RTF_CODEPAGE,
        RTF_FONT_CHARSET,
        RTF_UC,
        RTF_UNICODE_CHAR,
        RTF_CHAR,
        RTF_PARA,
        RTF_TABLE_START,
        RTF_TABLE_END,
        RTF_ROW,
        RTF_CELL,
        RTF_UNKNOWN,
        RTF_OVERLAY,
        RTF_PICT,
        RTF_F,
        RTF_AUTHOR,
        RTF_FONTTBL,
        RTF_INFO,
        RTF_STYLESHEET,
        RTF_COLORTBL,
        RTF_LISTOVERRIDETABLE,
        RTF_LISTTABLE,
        RTF_RSIDTBL,
        RTF_GENERATOR,
        RTF_DATAFIELD,
        RTF_LANG,
        RTF_PARD,
        RTF_TAB,
        RTF_SPEC_CHAR,
        RTF_EMDASH,
        RTF_ENDASH,
        RTF_EMSPACE,
        RTF_ENSPACE,
        RTF_BULLET,
        RTF_LQUOTE,
        RTF_RQUOTE,
        RTF_LDBLQUOTE,
        RTF_RDBLQUOTE,
        RTF_ZWNONJOINER
} RTFTypes;

/**
 * One row of the control-word lookup table: the control word as written
 * after the backslash, and the type it is classified as.
 */
typedef struct {
        char *name;
        RTFTypes type;
} RTFTypeMap;

/**
 * Control words recognised by getCommandType(). Lookup is by exact string
 * match, so the order of the rows does not matter; any word not listed here
 * is reported as RTF_UNKNOWN.
 */
RTFTypeMap rtf_types[]={
        {"uc",RTF_UC},
        {"ansicpg",RTF_CODEPAGE},
        {"pard",RTF_PARD},
        {"par",RTF_PARA},
        {"cell",RTF_CELL},
        {"row",RTF_ROW},
        {"overlay",RTF_OVERLAY},
        {"pict",RTF_PICT},
        {"author",RTF_AUTHOR},
        {"f",RTF_F},
        {"fonttbl",RTF_FONTTBL},
        {"info",RTF_INFO},
        {"stylesheet",RTF_STYLESHEET},
        {"colortbl",RTF_COLORTBL},
        {"listtable",RTF_LISTTABLE},
        {"listoverridetable",RTF_LISTOVERRIDETABLE},
        {"rsidtbl",RTF_RSIDTBL},
        {"generator",RTF_GENERATOR},
        {"datafield",RTF_DATAFIELD},
        {"lang",RTF_LANG},
        {"tab",RTF_TAB},
        {"emdash",RTF_EMDASH},
        {"endash",RTF_ENDASH},
        /* These two used to be mapped to the *DASH types, which made the
         * RTF_EMSPACE/RTF_ENSPACE handling in parse_rtf() unreachable and
         * printed dashes where spaces were meant. */
        {"emspace",RTF_EMSPACE},
        {"enspace",RTF_ENSPACE},
        {"bullet",RTF_BULLET},
        {"lquote",RTF_LQUOTE},
        {"rquote",RTF_RQUOTE},
        {"ldblquote",RTF_LDBLQUOTE},
        {"rdblquote",RTF_RDBLQUOTE},
        {"zwnj",RTF_ZWNONJOINER},
        {"u",RTF_UNICODE_CHAR}
};
103
104 #define RTFNAMEMAXLEN 32
105 #define RTFARGSMAXLEN 64
106
107 /**
108  * Structure describing rtf command
109  * 
110  */
111 typedef struct {
112         RTFTypes type;
113         char name[RTFNAMEMAXLEN+1];
114         signed int numarg;
115 /*      void *args; */
116 } RTFcommand;
117
118
#define MAXFONTNAME 64
/**
 * Font description record.
 *
 * NOTE(review): not referenced by any code visible in this file; `name`
 * is presumably the numeric font id - confirm before relying on it.
 */
typedef struct {
        int name;                       /**< numeric identifier */
        char fontname[MAXFONTNAME+1];   /**< font name, room for the NUL */
} RTFFont;
128
/**
 * Style description: currently only carries a code page number.
 */
typedef struct {
        int codepage;           /**< code page associated with the style */
} RTFStyle;
136
137 /**
138  * Structure to store values, local to rtf group
139  * 
140  */
141 typedef struct {
142         int uc;                                         /**< How much symbols to skip */
143         RTFStyle* style;                        /**< curren style */
144 } RTFGroupData;
145
146 /********************************************************
147  * Functions declaration
148  * 
149  */
150
151 extern int forced_charset;
152 signed long getNumber(FILE *f);
153
154 int getRtfCommand(FILE *f, RTFcommand *command );
155 unsigned short int rtf_to_unicode(int code);
156 RTFTypes getCommandType(char *name);
157 signed int getCharCode(FILE *f);
158 void rtfSetCharset(short int **charset_ptr,unsigned int codepage);
159
160 /********************************************************
161  * Global data
162  * 
163  */
/* Charset table used by rtf_to_unicode(); assigned in parse_rtf() and
 * replaced by rtfSetCharset(). */
short *current_charset;
/* Not referenced by the code visible in this file. */
int rtf_level = 0;
166
167 /********************************************************
168  * Functions implementation
169  * 
170  */
171 extern unsigned short int buffer[];
172 void add_to_buffer(int *bufptr,unsigned short int c) {
173         buffer[++(*bufptr)]=c;
174         if (*bufptr >= PARAGRAPH_BUFFER-2) {
175                 buffer[++(*bufptr)]=0;
176 /*****************************************************************/
177 /* Reading routines for MS-Word, MS-Write and text files         */
178 /*                                                               */
179 /* This file is part of catdoc project                           */
180 /* (c) Victor Wagner 1996-2003, (c) Alex Ott 2003                    */
181 /*****************************************************************/
182 #ifdef HAVE_CONFIG_H
183 #include <config.h>
184 #endif
185 #include <string.h>
186 #include <stdio.h>
187 #include "catdoc.h"
188 unsigned short int buffer[PARAGRAPH_BUFFER];
189 static unsigned char read_buf[256];
190 static int buf_is_unicode;
191
192 /**************************************************************************/
193 /* Just prints out content of input file. Called when file is not OLE     */
194 /* stream                                                                 */
195 /* Parameters - f - file to copy out. header - first few bytes of file,   */
196 /*  which have been already read by format recognition code, but should   */
197 /*  be output anyway                                                      */
198 /**************************************************************************/
199 void copy_out (FILE *f,char *header) {
200         char *buf=(char *)buffer;
201         int count,i;
202         long offset;
203         if (get_unicode_char == get_word8_char) {
204                 /* non-word file and -u specified. Trying to guess which kind of
205                  * unicode is used
206                  */
207                 if ((unsigned char)header[0]==0xFE && (unsigned char)header[1]==0xFF) {
208                         get_unicode_char = get_utf16msb;
209                         fputs(convert_char(header[2]<<8|header[3]),stdout); 
210                         fputs(convert_char(header[4]<<8|header[5]),stdout); 
211                         fputs(convert_char(header[6]<<8|header[7]),stdout); 
212                 } else if ((unsigned char)header[0]!=0xFF ||
213                                 (unsigned char)header[1]!=0xFE) {
214                         int c,j,d;
215                         /* if it is not utf16, assume it is UTF8. We are told -u,
216                          * aren't we */
217                         get_unicode_char = get_utf8;
218                         i=0;
219                         while (i<8) {
220                                 c=(unsigned char)header[i++];           
221                                 if (c >=0x80) {
222                                         if ( c<0xE0) {
223                                                 c=(c & 0x1F);
224                                                 count =1;
225                                         } else {
226                                                 c=(c & 0xF);
227                                                 count = 2;
228                                         }
229                                         for (j=0;j<count;j++) {
230                                                 if (i<7) {
231                                                         d=(unsigned char) header[i++];
232                                                 } else {
233                                                         d=fgetc(f);
234                                                 }
235                                                 c=c<<6 | (d & 0x3F);
236                                         }
237                                 }
238                                 fputs (convert_char(c),stdout);
239                         }
240                 } else {
241                         get_unicode_char = get_utf16lsb;
242                         fputs(convert_char(header[3]<<8|header[2]),stdout); 
243                         fputs(convert_char(header[5]<<8|header[4]),stdout); 
244                         fputs(convert_char(header[7]<<8|header[6]),stdout); 
245                 }           
246                 while (!catdoc_eof(f)) {
247                         i=get_unicode_char(f,&offset,0x7FFFFFFF); 
248                         if (i!=EOF) fputs(convert_char(i),stdout);
249                 }    
250         } else {
251                 for (i=0;i<8;i++) {
252                         fputs(convert_char(to_unicode(source_charset,(unsigned char)header[i])),stdout);
253                 }                        
254                 /* Assuming 8-bit input text */
255                 while ((count = catdoc_read(buf,1,PARAGRAPH_BUFFER,f))) {
256                         for (i=0;i<count;i++) {
257                                 fputs(convert_char(to_unicode(source_charset,
258                                                                 (unsigned char)buf[i])),stdout);
259                         }                      
260                 }
261         } 
262
263 /**************************************************************************/
264 /*  process_file - main process engine. Reads word file using function,   */
265 /*  pointed by get_unicode_char, searches for things which looks like     */
266 /*  paragraphs and print them out                                         */
267 /**************************************************************************/
268 int process_file(FILE *f,long stop) {
269         int bufptr;
270         int tabmode=0;
271         long offset=0;
272         int hyperlink_mode = 0;
273         unsigned short c;
274         /* Now we are starting to read with get_unicode_char */
275         while (!catdoc_eof(f) && offset<stop) {
276                 bufptr = -1;
277                 do {
278                         c=get_unicode_char(f,&offset,stop);
279                         /* Following symbols below 32 are allowed inside paragraph:
280                            0x0002 - footnote mark
281                            0x0007 - table separator (converted to tabmode)
282                            0x0009 - Horizontal tab ( printed as is)
283                            0x000B - hard return
284                            0x000C - page break
285                            0x000D - return - marks an end of paragraph
286                            0x001E - IS2 for some reason means short defis in Word.
287                            0x001F - soft hyphen in Word
288                            0x0013 - start embedded hyperlink
289                            0x0014 - separate hyperlink URL from text
290                            0x0015 - end embedded hyperlink
291                            */
292                         if (tabmode) {
293                                 tabmode=0;
294                                 if (c==0x007) {
295                                         buffer[++bufptr]=0x1E;
296                                         continue;
297                                 } else {
298                                         buffer[++bufptr]=0x1C;
299                                 }  
300                         }        
301                         if (c<32) {
302                                 switch (c) {
303                                         case 0x007:
304                                                 tabmode = 1;
305                                                 break;
306                                         case 0x000D:
307                                         case 0x000B:
308                                                 buffer[++bufptr]=0x000A;
309                                                 break;
310                                         case 0x000C:
311                                                 buffer[++bufptr]=c;
312                                                 break;
313                                         case 0x001E:
314                                                 buffer[++bufptr]='-';
315                                                 break;
316                                         case 0x0002: break;
317
318                                         case 0x001F:
319                                                                  buffer[++bufptr]=0xAD;/* translate to Unicode
320                                                                                                                   soft hyphen */
321                                                                  break;                                           
322                                         case 0x0009:
323                                                                  buffer[++bufptr]=c;
324                                                                  break;
325                                         case 0x0013:
326                                                                  hyperlink_mode=1;
327                                                                  buffer[++bufptr]=' ';
328                                                                  break;
329                                         case 0x0014:
330                                                                  hyperlink_mode = 0;
331                                                                  /*fall through */
332                                         case 0x0015:
333                                                                  /* just treat hyperlink separators as
334                                                                   * space */
335                                                                  buffer[++bufptr]=' ';
336                                                                  break;
337                                         case 0x0001: if (hyperlink_mode) 
338                                                                                 break;
339                                                                  /* else fall through */
340                                         default:
341                                                                  bufptr=-1; /* Any other control char - discard para*/
342                                 }
343                         } else if (c != 0xfeff) {
344                                 /* skip ZERO-WIDTH-UNBREAKABLE-SPACE. Output anything
345                                  * else*/
346                                 buffer[++bufptr]=c;
347                         }
348                 } while (bufptr<=PARAGRAPH_BUFFER-2 &&
349                                  !catdoc_eof(f) &&
350                                  buffer[bufptr]!=0x000a);
351                 if (bufptr>0) {
352                         buffer[++bufptr]=0;
353                         output_paragraph(buffer);
354                 }
355         }
356         return 0;
357 }
358 /**********************************************************************/
359 /* Reads file from MS-Word 97 and above file. Takes in account strange*
360  * situation that unicode and non-unicode 256-byte blocks could be    *
361  * intermixed in word file                                            *
362  *                                                                    *
363  * Parameters:                                                        *
364  *                                                                    *
365  * f - file to read                                                   *
366  * offset - position of the character inside file (to determine       * 
367  * possible  block boundaries                                         *
368  **********************************************************************/ 
369 int get_word8_char(FILE *f,long *offset,long fileend) {
370         int count,i,u;
371         char c;
372         if ((i=(*offset)%256) ==0) {
373                 count=catdoc_read(read_buf,1,256,f);
374                 memset(read_buf+count,0,256-count);
375                 buf_is_unicode=0;
376                 if (*offset+(long)count>fileend) {
377                         count=fileend-*offset;
378                 }       
379                 while (i<count) {
380                         c=read_buf[i++];
381                         if ((c==0x20|| c==0x0D||ispunct(c))&&i<count&&read_buf[i]==0) {
382                                 buf_is_unicode=1;
383                                 break;
384                         }
385                         i++;
386                 }   
387                 i=0;
388         }    
389         if (buf_is_unicode) {
390                 u=read_buf[i] | read_buf[i+1]<<8;
391                 (*offset)+=2;
392         } else {
393                 u=to_unicode(source_charset,read_buf[i]);
394                 (*offset)++;
395         }
396         return u;
397 }  
398
399
400                 output_paragraph(buffer);
401                 *bufptr=-1;
402         }
403 }
404
405 void end_paragraph(int *bufptr) {
406                                    add_to_buffer(bufptr,0x000a);
407                                    add_to_buffer(bufptr,0);
408                                    output_paragraph(buffer);
409                                    *bufptr=-1;
410 }                                  
411
412 /** 
413  * Parses RTF file from file stream
414  * 
415  * @param f - file stream descriptor
416  */
417 int parse_rtf(FILE *f) {
418         int para_mode=0, data_skip_mode=0,i;
419         RTFGroupData *groups=NULL;
420         int group_count=0, group_store=20;
421         int bufptr=-1;
422         current_charset=source_charset;
423         fseek(f,0,SEEK_SET);
424         if((groups=(RTFGroupData*)calloc(group_store,sizeof(RTFGroupData))) == NULL ) {
425                 perror("Can\'t allocate memory: ");
426                 return 1;
427         }
428         groups[0].uc = 2; /* DEfault uc = 2 */
429         while ( !feof(f) ) {
430                 int c = fgetc(f);
431                 if ( feof( f ) )
432                         break;
433                 switch (c) {
434                 case '\\': {
435                         int code;
436                         RTFcommand com;
437                         if ((code=getRtfCommand(f, &com)) != 0)
438                                 break;
439                         switch (com.type) {
440                         case RTF_SPEC_CHAR:
441 /*                              fprintf(stderr, "Spec Char found=%s and arg=%c\n", */
442 /*                              com.name, com.numarg); */
443                                 if (com.numarg == '*' && data_skip_mode == 0) {
444                                         data_skip_mode=group_count;
445                                 } else if (com.numarg == '\r') {
446                                         end_paragraph(&bufptr);
447                                 } else if (com.numarg == '~') {
448                                         add_to_buffer(&bufptr,0xA0);/* NO-BREAK SPACE */
449                                 } else if (com.numarg == '-') {
450                                         add_to_buffer(&bufptr,0xAD);/* Optional hyphen */
451                                 }       
452
453                                    break;
454                         case RTF_EMDASH:
455                                    add_to_buffer(&bufptr,0x2014);/* EM DASH*/
456                                    break;
457                         case RTF_ENDASH: 
458                                    add_to_buffer(&bufptr,0x2013);break;
459                         case RTF_BULLET: 
460                                    add_to_buffer(&bufptr,0x2022);break;
461                         case RTF_LQUOTE: add_to_buffer(&bufptr,0x2018);break;
462                         case RTF_RQUOTE: add_to_buffer(&bufptr,0x2019);break;
463                         case RTF_LDBLQUOTE: add_to_buffer(&bufptr,0x201C);break;
464                         case RTF_RDBLQUOTE: add_to_buffer(&bufptr,0x201D);break;
465                         case RTF_ZWNONJOINER: add_to_buffer(&bufptr,0xfeff);break;
466                         case RTF_EMSPACE:
467                         case RTF_ENSPACE:
468                                         add_to_buffer(&bufptr,' ');break;
469                         case RTF_CHAR:
470 /*                              fprintf(stderr, "RTF char %d\n", com.numarg); */
471                                 if (data_skip_mode == 0) {
472                                         add_to_buffer(&bufptr,rtf_to_unicode(com.numarg));
473                                 }       
474                                 break;
475                         case RTF_UC:
476                                 groups[group_count].uc=com.numarg;
477                                 break;
478                         case RTF_TAB:
479                                 add_to_buffer(&bufptr,0x0009);
480                                 break;
481                         case RTF_UNICODE_CHAR:
482                                 if (com.numarg < 0)
483                                         break;
484 /*                              fprintf(stderr, "Unicode char %d\n", com.numarg);  */
485                                 if (data_skip_mode == 0)
486                                         add_to_buffer(&bufptr,com.numarg);
487                                 i=groups[group_count].uc;
488                                 while((--i)>0)
489                                         fgetc(f);
490                                 break;
491                         case RTF_PARA:
492                                 /*if (para_mode > 0) {*/
493                                         end_paragraph(&bufptr); 
494                                 /*}*/   
495                                 para_mode=group_count;
496                                 break;
497                         case RTF_PICT:
498                         case RTF_FONTTBL:
499                         case RTF_INFO:
500                         case RTF_COLORTBL:
501                         case RTF_STYLESHEET:
502                         case RTF_LISTTABLE:
503                         case RTF_LISTOVERRIDETABLE:
504                         case RTF_RSIDTBL:
505                         case RTF_GENERATOR:
506                         case RTF_DATAFIELD:
507                                 if (data_skip_mode == 0){
508                                         data_skip_mode=group_count;
509                                 }
510                                 break;
511                         case RTF_LANG:
512 /*                              fprintf(stderr, "Selected lang = %d\n",com.numarg); */
513                                 break;
514                         case RTF_CODEPAGE:
515                                 rtfSetCharset(&current_charset,com.numarg);
516                         default:
517 /*                              fprintf(stderr, "Unknown command with name %s and arg=%d\n",  */
518 /*                                              com.name, com.numarg);  */
519                         ;
520                         }
521                         break;
522                 }
523                 case '{':
524                         group_count++;
525                         if (group_count >= group_store ) {
526                                 group_store+=10;
527                                 if((groups=(RTFGroupData*)realloc(groups,
528                                                                                                   group_store*sizeof(RTFGroupData)))
529                                    == NULL ) {
530                                         perror("Can\'t allocate memory: ");
531                                         return 1;
532                                 }
533                         }
534                         if (para_mode)
535                                 add_to_buffer(&bufptr,0x20);
536                         groups[group_count]=groups[group_count-1];
537                         break;
538                 case '}':
539                         group_count--;
540                         if(group_count < 0)
541                                 group_count=0;
542                         if(para_mode > 0 && para_mode > group_count) {
543                                 /*add_to_buffer(&bufptr,0);
544                                 output_paragraph(buffer);
545                                 fprintf(stderr,"\nGROUP_END para_mode=%d group_count=%d bufptr=%d\n", para_mode,group_count,bufptr);
546                                 bufptr=-1;*/
547                                 para_mode=0;
548                         }
549                         if(data_skip_mode > group_count) {
550                                 data_skip_mode=0;
551                         }
552                         break;
553                 default:
554                         if (data_skip_mode == 0)
555                                 if (c != '\n' && c != '\r')
556                                         add_to_buffer(&bufptr,rtf_to_unicode(c));
557                 }
558         }
559         if (bufptr>=0) {
560                 add_to_buffer(&bufptr,'\n');
561                 add_to_buffer(&bufptr,0);
562                 output_paragraph(buffer);
563         }       
564         free(groups);
565         return 0;
566 }  
567
568 /** 
569  * Convert text string to number
570  * 
571  * @param f stream to read data from
572  * 
573  * @return converted number
574  */
575 signed long getNumber(FILE *f) {
576         int c,count=0;
577         char buf[RTFARGSMAXLEN];
578         
579         while(isdigit(c=fgetc(f)) || c=='-') {
580                 if(feof(f))
581                         return -1;
582                 buf[count++]=(char)c;
583         }
584         ungetc(c,f);
585         buf[count]='\0';
586         return strtol(buf, (char **)NULL, 10);
587 }
588
589 /** 
590  * Parse command stream from rtf file and fill command structure
591  * 
592  * @param f - rtf file stream
593  * @param command - pointer to RTFcommand structure to fill
594  * 
595  * @return parse code not 0 - error, 0 - success
596  */
597 int getRtfCommand(FILE *f, RTFcommand *command ) {
598         int c=fgetc(f);
599         if (isalpha(c)) {
600                 int name_count=1;
601                 command->name[0]=(char)c;
602                 while(isalpha(c=fgetc(f)) && name_count < RTFNAMEMAXLEN) {
603                         if(feof(f))
604                                 return 1;
605                         command->name[name_count++]=(char)c;
606                 }
607                 command->name[name_count]='\0';
608                 command->type=getCommandType(command->name);
609 /*              command->args=NULL; */
610                 ungetc(c,f);
611                 if (isdigit(c) || c == '-' )
612                         command->numarg=getNumber(f);
613                 else
614                         command->numarg=0;
615                 c=fgetc(f);
616                 if(!(c==' ' || c=='\t'))
617                         ungetc(c,f);
618         } else {
619                 command->name[0]=(char)c;
620                 command->name[1]='\0';
621 /*              command->args=NULL; */
622                 if (c == '\'') {
623                         command->type=RTF_CHAR;
624                         command->numarg=getCharCode(f);
625                         if(feof(f))
626                                 return -1;
627                 } else {
628                         command->type=RTF_SPEC_CHAR;
629                         command->numarg=c;
630                 }
631         }
632         
633         return 0;
634 }
635
636 /** 
637  * Converts char to unicode.
638  * 
639  * @param code - integer code of char
640  * 
641  * @return converted char
642  */
643 unsigned short int rtf_to_unicode(int code) {
644         int cc=code;
645         if (code < 0 || (cc=to_unicode(current_charset, code)) < 0 ) return 0xFEFF;
646         return cc;
647 }
648
649 /** 
650  * Convert name of RTF command to RTFType
651  * 
652  * @param name name to convert
653  * 
654  * @return RTFType, if unknown command, then return RTF_UNKNOWN
655  */
656 RTFTypes getCommandType(char *name) {
657         int i, olen=sizeof(rtf_types)/sizeof(RTFTypeMap);
658         for (i = 0; i < olen ; i++) {
659                 if ( strcmp(name,rtf_types[i].name) == 0 ) {
660                         return rtf_types[i].type;
661                 }
662         }
663         return RTF_UNKNOWN;
664 }
665
/**
 * Return number representing char code in Hex
 *
 * Reads up to two hexadecimal digits (either letter case) from the stream.
 * A character that is not a hex digit is pushed back and left for the
 * caller.
 *
 * @param f stream to read data from
 *
 * @return value of the digits read; 0 if there were none
 */
signed int getCharCode(FILE *f) {
        int c,count=0,i;
        char buf[3]; /* two hex digits and the terminator */

        for(i=0;i<2; i++) {
                c=fgetc(f);
                if (isxdigit(c)) {
                        buf[count++]=(char)c;
                } else if (c != EOF) {
                        ungetc(c,f);
                }
        }

        buf[count]='\0';
        return strtol(buf, (char **)NULL, 16);
}
688
689 void rtfSetCharset(short int **charset_ptr,unsigned int codepage)
690 {
691         /* Do not override charset if it is specified in the command line */
692         const char *charset_name;
693         char *save_buf = input_buffer;
694         if (forced_charset) return;
695         charset_name = charset_from_codepage(codepage);
696         check_charset(&source_csname,charset_name);
697         input_buffer=NULL;
698         *charset_ptr = read_charset(source_csname);     
699         input_buffer = save_buf;
700 }