]> www.wagner.pp.ru Git - oss/catdoc.git/blob - src/rtfread.c
Add detection of ZIP-archive and report that this type of file (i.e. OOXML or OpenDoc...
[oss/catdoc.git] / src / rtfread.c
1 /*****************************************************************/
2 /* Reading routines for rtf files                                */
3 /*                                                               */
4 /* This file is part of catdoc project                           */
5 /* (c) Victor Wagner 2003, (c) Alex Ott 2003                 */
6 /*****************************************************************/
7 #ifdef HAVE_CONFIG_H
8 #include <config.h>
9 #endif
10
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <ctype.h>
14 #include <string.h>
15 #include "catdoc.h"
16
17 /********************************************************
18  * Datatypes declaration
19  * 
20  */
/**
 * Kinds of RTF control words and control symbols known to the reader.
 *
 * Control-word names are mapped to these values through rtf_types[];
 * RTF_CHAR and RTF_SPEC_CHAR are assigned directly while a command is
 * being read, and RTF_UNKNOWN is used for names missing from the table.
 * The order of the enumerators is kept stable on purpose.
 */
typedef enum {
	RTF_CODEPAGE,           /* \ansicpg */
	RTF_FONT_CHARSET,       /* no entry in rtf_types[] */
	RTF_UC,                 /* \uc */
	RTF_UNICODE_CHAR,       /* \u */
	RTF_CHAR,               /* \'hh hexadecimal character escape */
	RTF_PARA,               /* \par */
	RTF_TABLE_START,        /* no entry in rtf_types[] */
	RTF_TABLE_END,          /* no entry in rtf_types[] */
	RTF_ROW,                /* \row */
	RTF_CELL,               /* \cell */
	RTF_UNKNOWN,            /* any control word not listed in rtf_types[] */
	RTF_OVERLAY,            /* \overlay */
	RTF_PICT,               /* \pict */
	RTF_F,                  /* \f */
	RTF_AUTHOR,             /* \author */
	RTF_FONTTBL,            /* \fonttbl */
	RTF_INFO,               /* \info */
	RTF_STYLESHEET,         /* \stylesheet */
	RTF_COLORTBL,           /* \colortbl */
	RTF_LISTOVERRIDETABLE,  /* \listoverridetable */
	RTF_LISTTABLE,          /* \listtable */
	RTF_RSIDTBL,            /* \rsidtbl */
	RTF_GENERATOR,          /* \generator */
	RTF_DATAFIELD,          /* \datafield */
	RTF_LANG,               /* \lang */
	RTF_PARD,               /* \pard */
	RTF_TAB,                /* \tab */
	RTF_SPEC_CHAR,          /* backslash followed by a non-letter */
	RTF_EMDASH,             /* \emdash */
	RTF_ENDASH,             /* \endash */
	RTF_EMSPACE,            /* \emspace */
	RTF_ENSPACE,            /* \enspace */
	RTF_BULLET,             /* \bullet */
	RTF_LQUOTE,             /* \lquote */
	RTF_RQUOTE,             /* \rquote */
	RTF_LDBLQUOTE,          /* \ldblquote */
	RTF_RDBLQUOTE,          /* \rdblquote */
	RTF_ZWNONJOINER         /* \zwnj */
} RTFTypes;

/**
 * One row of the name -> type lookup table.
 */
typedef struct {
	const char *name;       /* control word without the leading backslash */
	RTFTypes type;          /* type reported for that control word */
} RTFTypeMap;

/**
 * Control words recognised by the reader. Lookup is by exact string
 * comparison, so the order of the rows does not matter.
 */
RTFTypeMap rtf_types[]={
	{"uc",RTF_UC},
	{"ansicpg",RTF_CODEPAGE},
	{"pard",RTF_PARD},
	{"par",RTF_PARA},
	{"cell",RTF_CELL},
	{"row",RTF_ROW},
	{"overlay",RTF_OVERLAY},
	{"pict",RTF_PICT},
	{"author",RTF_AUTHOR},
	{"f",RTF_F},
	{"fonttbl",RTF_FONTTBL},
	{"info",RTF_INFO},
	{"stylesheet",RTF_STYLESHEET},
	{"colortbl",RTF_COLORTBL},
	{"listtable",RTF_LISTTABLE},
	{"listoverridetable",RTF_LISTOVERRIDETABLE},
	{"rsidtbl",RTF_RSIDTBL},
	{"generator",RTF_GENERATOR},
	{"datafield",RTF_DATAFIELD},
	{"lang",RTF_LANG},
	{"tab",RTF_TAB},
	{"emdash",RTF_EMDASH},
	{"endash",RTF_ENDASH},
	/* The space keywords must map to the space types, not to the dashes. */
	{"emspace",RTF_EMSPACE},
	{"enspace",RTF_ENSPACE},
	{"bullet",RTF_BULLET},
	{"lquote",RTF_LQUOTE},
	{"rquote",RTF_RQUOTE},
	{"ldblquote",RTF_LDBLQUOTE},
	{"rdblquote",RTF_RDBLQUOTE},
	{"zwnj",RTF_ZWNONJOINER},
	{"u",RTF_UNICODE_CHAR}
};
103
104 #define RTFNAMEMAXLEN 32
105 #define RTFARGSMAXLEN 64
106 #define MAX_DIGITS_IN_NUMBER 10
107
108 /**
109  * Structure describing rtf command
110  * 
111  */
112 typedef struct {
113         RTFTypes type;
114         char name[RTFNAMEMAXLEN+1];
115         signed int numarg;
116 /*      void *args; */
117 } RTFcommand;
118
119
/* Longest font name kept in RTFFont.fontname (terminator excluded). */
#define MAXFONTNAME 64

/**
 * Font description: an integer identifier plus a font name.
 */
typedef struct {
	/* Integer identifier of the font. */
	int name;
	/* Font name; sized for MAXFONTNAME characters and a terminator. */
	char fontname[MAXFONTNAME+1];
} RTFFont;
129
/**
 * Style description; currently it only carries a code page.
 */
typedef struct {
	int codepage;
} RTFStyle;

/**
 * Values that are local to one RTF group ({ ... }).
 *
 * An array of these is used as a stack while parsing: opening a group
 * copies the enclosing group's entry into the new one.
 */
typedef struct {
	/** Argument of the \uc keyword in effect for this group; it controls
	 *  how many characters are skipped after a \u keyword. */
	int uc;
	/** Current style. */
	RTFStyle* style;
} RTFGroupData;
146
147 /********************************************************
148  * Functions declaration
149  * 
150  */
151
152 extern int forced_charset;
153 signed long getNumber(FILE *f);
154
155 int getRtfCommand(FILE *f, RTFcommand *command );
156 unsigned short int rtf_to_unicode(int code);
157 RTFTypes getCommandType(char *name);
158 signed int getCharCode(FILE *f);
159 void rtfSetCharset(short int **charset_ptr,unsigned int codepage);
160
161 /********************************************************
162  * Global data
163  * 
164  */
/* Charset table used by rtf_to_unicode() to convert character codes. */
short int *current_charset;
/* Starts at 0. */
int rtf_level=0;
167
168 /********************************************************
169  * Functions implementation
170  * 
171  */
172 extern unsigned short int buffer[];
173 void add_to_buffer(int *bufptr,unsigned short int c) {
174         buffer[++(*bufptr)]=c;
175         if (*bufptr >= PARAGRAPH_BUFFER-2) {
176                 buffer[++(*bufptr)]=0;
177                 output_paragraph(buffer);
178                 *bufptr=-1;
179         }
180 }
181
182 void end_paragraph(int *bufptr) {
183                                    add_to_buffer(bufptr,0x000a);
184                                    add_to_buffer(bufptr,0);
185                                    output_paragraph(buffer);
186                                    *bufptr=-1;
187 }                                  
188
189 /** 
190  * Parses RTF file from file stream
191  * 
192  * @param f - file stream descriptor
193  */
194 int parse_rtf(FILE *f) {
195         int para_mode=0, data_skip_mode=0,i;
196         RTFGroupData *groups=NULL;
197         int group_count=0, group_store=20;
198         int bufptr=-1;
199         current_charset=source_charset;
200         fseek(f,0,SEEK_SET);
201         if((groups=(RTFGroupData*)calloc(group_store,sizeof(RTFGroupData))) == NULL ) {
202                 perror("Can\'t allocate memory: ");
203                 return 1;
204         }
205         groups[0].uc = 2; /* DEfault uc = 2 */
206         while ( !feof(f) ) {
207                 int c = fgetc(f);
208                 if ( feof( f ) )
209                         break;
210                 switch (c) {
211                 case '\\': {
212                         int code;
213                         RTFcommand com;
214                         if ((code=getRtfCommand(f, &com)) != 0)
215                                 break;
216                         switch (com.type) {
217                         case RTF_SPEC_CHAR:
218 /*                              fprintf(stderr, "Spec Char found=%s and arg=%c\n", */
219 /*                              com.name, com.numarg); */
220                                 if (com.numarg == '*' && data_skip_mode == 0) {
221                                         data_skip_mode=group_count;
222                                 } else if (com.numarg == '\r') {
223                                         end_paragraph(&bufptr);
224                                 } else if (com.numarg == '~') {
225                                         add_to_buffer(&bufptr,0xA0);/* NO-BREAK SPACE */
226                                 } else if (com.numarg == '-') {
227                                         add_to_buffer(&bufptr,0xAD);/* Optional hyphen */
228                                 }       
229
230                                    break;
231                         case RTF_EMDASH:
232                                    add_to_buffer(&bufptr,0x2014);/* EM DASH*/
233                                    break;
234                         case RTF_ENDASH: 
235                                    add_to_buffer(&bufptr,0x2013);break;
236                         case RTF_BULLET: 
237                                    add_to_buffer(&bufptr,0x2022);break;
238                         case RTF_LQUOTE: add_to_buffer(&bufptr,0x2018);break;
239                         case RTF_RQUOTE: add_to_buffer(&bufptr,0x2019);break;
240                         case RTF_LDBLQUOTE: add_to_buffer(&bufptr,0x201C);break;
241                         case RTF_RDBLQUOTE: add_to_buffer(&bufptr,0x201D);break;
242                         case RTF_ZWNONJOINER: add_to_buffer(&bufptr,0xfeff);break;
243                         case RTF_EMSPACE:
244                         case RTF_ENSPACE:
245                                         add_to_buffer(&bufptr,' ');break;
246                         case RTF_CHAR:
247 /*                              fprintf(stderr, "RTF char %d\n", com.numarg); */
248                                 if (data_skip_mode == 0) {
249                                         add_to_buffer(&bufptr,rtf_to_unicode(com.numarg));
250                                 }       
251                                 break;
252                         case RTF_UC:
253                                 groups[group_count].uc=com.numarg;
254                                 break;
255                         case RTF_TAB:
256                                 add_to_buffer(&bufptr,0x0009);
257                                 break;
258                         case RTF_UNICODE_CHAR:
259                                 if (com.numarg < 0)
260                                         break;
261 /*                              fprintf(stderr, "Unicode char %d\n", com.numarg);  */
262                                 if (data_skip_mode == 0)
263                                         add_to_buffer(&bufptr,com.numarg);
264                                 i=groups[group_count].uc;
265                                 while((--i)>0) {
266                                         int c=fgetc(f);
267                                         if (c == '\\') {
268                                                 c = fgetc(f);
269                                                 switch (c) {
270                                                  case '\\': break;
271                                                  case '\'':
272                                                         /* skip two hex digits */
273                                                         fgetc(f);
274                                                         fgetc(f);
275                                                         break;
276                                                 default:
277                                                         break;
278                                                 }
279                                         }               
280                                 }       
281                                         
282                                 break;
283                         case RTF_PARA:
284                                 /*if (para_mode > 0) {*/
285                                         end_paragraph(&bufptr); 
286                                 /*}*/   
287                                 para_mode=group_count;
288                                 break;
289                         case RTF_PICT:
290                         case RTF_FONTTBL:
291                         case RTF_INFO:
292                         case RTF_COLORTBL:
293                         case RTF_STYLESHEET:
294                         case RTF_LISTTABLE:
295                         case RTF_LISTOVERRIDETABLE:
296                         case RTF_RSIDTBL:
297                         case RTF_GENERATOR:
298                         case RTF_DATAFIELD:
299                                 if (data_skip_mode == 0){
300                                         data_skip_mode=group_count;
301                                 }
302                                 break;
303                         case RTF_LANG:
304 /*                              fprintf(stderr, "Selected lang = %d\n",com.numarg); */
305                                 break;
306                         case RTF_CODEPAGE:
307                                 rtfSetCharset(&current_charset,com.numarg);
308                         default:
309 /*                              fprintf(stderr, "Unknown command with name %s and arg=%d\n",  */
310 /*                                              com.name, com.numarg);  */
311                         ;
312                         }
313                         break;
314                 }
315                 case '{':
316                         group_count++;
317                         if (group_count >= group_store ) {
318                                 group_store+=10;
319                                 if((groups=(RTFGroupData*)realloc(groups,
320                                                                                                   group_store*sizeof(RTFGroupData)))
321                                    == NULL ) {
322                                         perror("Can\'t allocate memory: ");
323                                         return 1;
324                                 }
325                         }
326                         if (para_mode)
327                                 add_to_buffer(&bufptr,0x20);
328                         groups[group_count]=groups[group_count-1];
329                         break;
330                 case '}':
331                         group_count--;
332                         if(group_count < 0)
333                                 group_count=0;
334                         if(para_mode > 0 && para_mode > group_count) {
335                                 /*add_to_buffer(&bufptr,0);
336                                 output_paragraph(buffer);
337                                 fprintf(stderr,"\nGROUP_END para_mode=%d group_count=%d bufptr=%d\n", para_mode,group_count,bufptr);
338                                 bufptr=-1;*/
339                                 para_mode=0;
340                         }
341                         if(data_skip_mode > group_count) {
342                                 data_skip_mode=0;
343                         }
344                         break;
345                 default:
346                         if (data_skip_mode == 0)
347                                 if (c != '\n' && c != '\r')
348                                         add_to_buffer(&bufptr,rtf_to_unicode(c));
349                 }
350         }
351         if (bufptr>=0) {
352                 add_to_buffer(&bufptr,'\n');
353                 add_to_buffer(&bufptr,0);
354                 output_paragraph(buffer);
355         }       
356         free(groups);
357         return 0;
358 }  
359
/**
 * Reads the numeric argument of a control word from the stream.
 *
 * The argument is an optional leading '-' followed by decimal digits.
 * A '-' that appears after at least one digit is not part of the number
 * and is left in the stream, as is the first character that ends the
 * number. At most eleven characters (sign included) are consumed.
 *
 * @param f stream to read data from
 *
 * @return converted number; 0 when no digits could be read (including at
 *         end of input)
 */
signed long getNumber(FILE *f) {
	/* Room for a sign and ten digits, enough for any 32-bit value. */
	enum { NUMBER_MAX_CHARS = 11 };
	char buf[NUMBER_MAX_CHARS + 1];
	int c, count = 0;

	while ((c = fgetc(f)) != EOF) {
		int is_sign = (c == '-' && count == 0);
		if (!isdigit(c) && !is_sign)
			break;
		if (count >= NUMBER_MAX_CHARS)
			break;
		buf[count++] = (char)c;
	}
	/* Give back the character that stopped the scan. */
	if (c != EOF)
		ungetc(c, f);
	buf[count] = '\0';
	return strtol(buf, (char **)NULL, 10);
}
382
383 /** 
384  * Parse command stream from rtf file and fill command structure
385  * 
386  * @param f - rtf file stream
387  * @param command - pointer to RTFcommand structure to fill
388  * 
389  * @return parse code not 0 - error, 0 - success
390  */
391 int getRtfCommand(FILE *f, RTFcommand *command ) {
392         int c=fgetc(f);
393         if (isalpha(c)) {
394                 int name_count=1;
395                 command->name[0]=(char)c;
396                 while(isalpha(c=fgetc(f)) && name_count < RTFNAMEMAXLEN) {
397                         if(feof(f))
398                                 return 1;
399                         command->name[name_count++]=(char)c;
400                 }
401                 command->name[name_count]='\0';
402                 command->type=getCommandType(command->name);
403 /*              command->args=NULL; */
404                 ungetc(c,f);
405                 if (isdigit(c) || c == '-' )
406                         command->numarg=getNumber(f);
407                 else
408                         command->numarg=0;
409                 c=fgetc(f);
410                 if(!(c==' ' || c=='\t'))
411                         ungetc(c,f);
412         } else {
413                 command->name[0]=(char)c;
414                 command->name[1]='\0';
415 /*              command->args=NULL; */
416                 if (c == '\'') {
417                         command->type=RTF_CHAR;
418                         command->numarg=getCharCode(f);
419                         if(feof(f))
420                                 return -1;
421                 } else {
422                         command->type=RTF_SPEC_CHAR;
423                         command->numarg=c;
424                 }
425         }
426         
427         return 0;
428 }
429
430 /** 
431  * Converts char to unicode.
432  * 
433  * @param code - integer code of char
434  * 
435  * @return converted char
436  */
437 unsigned short int rtf_to_unicode(int code) {
438         int cc=code;
439         if (code < 0 || (cc=to_unicode(current_charset, code)) < 0 ) return 0xFEFF;
440         return cc;
441 }
442
443 /** 
444  * Convert name of RTF command to RTFType
445  * 
446  * @param name name to convert
447  * 
448  * @return RTFType, if unknown command, then return RTF_UNKNOWN
449  */
450 RTFTypes getCommandType(char *name) {
451         int i, olen=sizeof(rtf_types)/sizeof(RTFTypeMap);
452         for (i = 0; i < olen ; i++) {
453                 if ( strcmp(name,rtf_types[i].name) == 0 ) {
454                         return rtf_types[i].type;
455                 }
456         }
457         return RTF_UNKNOWN;
458 }
459
/**
 * Reads the character code of a \'hh escape: up to two hexadecimal
 * digits, in either upper or lower case.
 *
 * The first character that is not a hexadecimal digit is left in the
 * stream.
 *
 * @param f stream to read data from
 *
 * @return converted number, or -1 when no hexadecimal digit could be
 *         read (malformed escape or end of input)
 */
signed int getCharCode(FILE *f) {
	char digits[3];
	int count = 0;

	while (count < 2) {
		int c = fgetc(f);
		if (c == EOF)
			break;
		if (!isxdigit(c)) {
			ungetc(c, f);
			break;
		}
		digits[count++] = (char)c;
	}
	/* Report a malformed escape instead of returning character code 0. */
	if (count == 0)
		return -1;
	digits[count] = '\0';
	return (int)strtol(digits, (char **)NULL, 16);
}
482
483 void rtfSetCharset(short int **charset_ptr,unsigned int codepage)
484 {
485         /* Do not override charset if it is specified in the command line */
486         const char *charset_name;
487         char *save_buf = input_buffer;
488         if (forced_charset) return;
489         charset_name = charset_from_codepage(codepage);
490         check_charset(&source_csname,charset_name);
491         input_buffer=NULL;
492         *charset_ptr = read_charset(source_csname);     
493         input_buffer = save_buf;
494 }