]> www.wagner.pp.ru Git - oss/catdoc.git/blob - src/rtfread.c
Recreated CVS repository from working copy
[oss/catdoc.git] / src / rtfread.c
1 /*****************************************************************/
2 /* Reading routines for rtf files                                */
3 /*                                                               */
4 /* This file is part of catdoc project                           */
5 /* (c) Victor Wagner 2003, (c) Alex Ott 2003                 */
6 /*****************************************************************/
7 #ifdef HAVE_CONFIG_H
8 #include <config.h>
9 #endif
10
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <ctype.h>
14 #include <string.h>
15 #include "catdoc.h"
16
17 /********************************************************
18  * Datatypes declaration
19  * 
20  */
/**
 * Kinds of RTF control words and control symbols the reader distinguishes.
 * The enumerators keep their original order, so their numeric values
 * are unchanged.
 */
typedef enum {
	RTF_CODEPAGE,           /* \ansicpg */
	RTF_FONT_CHARSET,       /* no keyword in rtf_types maps to this */
	RTF_UC,                 /* \uc */
	RTF_UNICODE_CHAR,       /* \u */
	RTF_CHAR,               /* \'hh, set directly by getRtfCommand() */
	RTF_PARA,               /* \par */
	RTF_TABLE_START,        /* no keyword in rtf_types maps to this */
	RTF_TABLE_END,          /* no keyword in rtf_types maps to this */
	RTF_ROW,                /* \row */
	RTF_CELL,               /* \cell */
	RTF_UNKNOWN,            /* any control word missing from rtf_types */
	RTF_OVERLAY,            /* \overlay */
	RTF_PICT,               /* \pict */
	RTF_F,                  /* \f */
	RTF_AUTHOR,             /* \author */
	RTF_FONTTBL,            /* \fonttbl */
	RTF_INFO,               /* \info */
	RTF_STYLESHEET,         /* \stylesheet */
	RTF_COLORTBL,           /* \colortbl */
	RTF_LISTOVERRIDETABLE,  /* \listoverridetable */
	RTF_LISTTABLE,          /* \listtable */
	RTF_RSIDTBL,            /* \rsidtbl */
	RTF_GENERATOR,          /* \generator */
	RTF_DATAFIELD,          /* \datafield */
	RTF_LANG,               /* \lang */
	RTF_PARD,               /* \pard */
	RTF_TAB,                /* \tab */
	RTF_SPEC_CHAR,          /* control symbol other than \', set by getRtfCommand() */
	RTF_EMDASH,             /* \emdash */
	RTF_ENDASH,             /* \endash */
	RTF_EMSPACE,            /* em space */
	RTF_ENSPACE,            /* en space */
	RTF_BULLET,             /* \bullet */
	RTF_LQUOTE,             /* \lquote */
	RTF_RQUOTE,             /* \rquote */
	RTF_LDBLQUOTE,          /* \ldblquote */
	RTF_RDBLQUOTE,          /* \rdblquote */
	RTF_ZWNONJOINER         /* \zwnj */
} RTFTypes;
61
62 typedef struct {
63         char *name;
64         RTFTypes type;
65 } RTFTypeMap;
66
67 RTFTypeMap rtf_types[]={
68         {"uc",RTF_UC},
69         {"ansicpg",RTF_CODEPAGE},
70         {"pard",RTF_PARD},
71         {"par",RTF_PARA},
72         {"cell",RTF_CELL},
73         {"row",RTF_ROW},
74         {"overlay",RTF_OVERLAY}, 
75         {"pict",RTF_PICT},
76         {"author",RTF_AUTHOR},
77         {"f",RTF_F}, 
78         {"fonttbl",RTF_FONTTBL}, 
79         {"info",RTF_INFO}, 
80         {"stylesheet",RTF_STYLESHEET},
81         {"colortbl",RTF_COLORTBL},
82         {"listtable",RTF_LISTTABLE},
83         {"listoverridetable",RTF_LISTOVERRIDETABLE},
84         {"rsidtbl",RTF_RSIDTBL}, 
85         {"generator",RTF_GENERATOR}, 
86         {"datafield",RTF_DATAFIELD}, 
87         {"lang",RTF_LANG}, 
88         {"tab",RTF_TAB}, 
89         {"emdash",RTF_EMDASH},
90         {"endash",RTF_ENDASH},
91         {"emspace",RTF_EMDASH},
92         {"enspace",RTF_ENDASH},
93         {"bullet",RTF_BULLET}, 
94         {"lquote",RTF_LQUOTE},
95         {"rquote",RTF_RQUOTE},
96         {"ldblquote",RTF_LDBLQUOTE},
97         {"rdblquote",RTF_RDBLQUOTE},
98         {"zwnj",RTF_ZWNONJOINER},
99 /*      {"",}, */
100 /*      {"",}, */
101         {"u",RTF_UNICODE_CHAR}
102 };
103
104 #define RTFNAMEMAXLEN 32
105 #define RTFARGSMAXLEN 64
106
107 /**
108  * Structure describing rtf command
109  * 
110  */
111 typedef struct {
112         RTFTypes type;
113         char name[RTFNAMEMAXLEN+1];
114         signed int numarg;
115 /*      void *args; */
116 } RTFcommand;
117
118
/* Longest font name kept in RTFFont.fontname, terminator excluded */
#define MAXFONTNAME 64

/**
 * Description of a font.
 * NOTE(review): nothing in the visible part of this file uses this type;
 * "name" is presumably the font number and "fontname" its face name --
 * confirm before relying on it.
 */
typedef struct {
	int name;
	char fontname[MAXFONTNAME+1];
} RTFFont;
128
/**
 * Style description; at present it carries only a code page number.
 */
typedef struct {
	int codepage;   /* code page associated with the style */
} RTFStyle;
136
137 /**
138  * Structure to store values, local to rtf group
139  * 
140  */
141 typedef struct {
142         int uc;                                         /**< How much symbols to skip */
143         RTFStyle* style;                        /**< curren style */
144 } RTFGroupData;
145
146 /********************************************************
147  * Functions declaration
148  * 
149  */
150
151 extern int forced_charset;
152 signed long getNumber(FILE *f);
153
154 int getRtfCommand(FILE *f, RTFcommand *command );
155 unsigned short int rtf_to_unicode(int code);
156 RTFTypes getCommandType(char *name);
157 signed int getCharCode(FILE *f);
158 void rtfSetCharset(short int **charset_ptr,unsigned int codepage);
159
160 /********************************************************
161  * Global data
162  * 
163  */
/* Charset table used by rtf_to_unicode(); parse_rtf() points it at
 * source_charset and \ansicpg may replace it. */
short int *current_charset;
/* Initialised to zero; not modified by the code visible in this file. */
int rtf_level = 0;
166
167 /********************************************************
168  * Functions implementation
169  * 
170  */
171 extern unsigned short int buffer[];
172 void add_to_buffer(int *bufptr,unsigned short int c) {
173         buffer[++(*bufptr)]=c;
174         if (*bufptr > PARAGRAPH_BUFFER-2) {
175                 buffer[++(*bufptr)]=0;
176                 output_paragraph(buffer);
177                 *bufptr=-1;
178         }
179 }
180
181 void end_paragraph(int *bufptr) {
182                                    add_to_buffer(bufptr,0x000a);
183                                    add_to_buffer(bufptr,0);
184                                    output_paragraph(buffer);
185                                    *bufptr=-1;
186 }                                  
187
188 /** 
189  * Parses RTF file from file stream
190  * 
191  * @param f - file stream descriptor
192  */
193 int parse_rtf(FILE *f) {
194         int para_mode=0, data_skip_mode=0,i;
195         RTFGroupData *groups=NULL;
196         int group_count=0, group_store=20;
197         int bufptr=-1;
198         current_charset=source_charset;
199         fseek(f,0,SEEK_SET);
200         if((groups=(RTFGroupData*)calloc(group_store,sizeof(RTFGroupData))) == NULL ) {
201                 perror("Can\'t allocate memory: ");
202                 return 1;
203         }
204         groups[0].uc = 2; /* DEfault uc = 2 */
205         while ( !feof(f) ) {
206                 int c = fgetc(f);
207                 if ( feof( f ) )
208                         break;
209                 switch (c) {
210                 case '\\': {
211                         int code;
212                         RTFcommand com;
213                         if ((code=getRtfCommand(f, &com)) != 0)
214                                 break;
215                         switch (com.type) {
216                         case RTF_SPEC_CHAR:
217 /*                              fprintf(stderr, "Spec Char found=%s and arg=%c\n", */
218 /*                              com.name, com.numarg); */
219                                 if (com.numarg == '*' && data_skip_mode == 0) {
220                                         data_skip_mode=group_count;
221                                 } else if (com.numarg == '\r') {
222                                         end_paragraph(&bufptr);
223                                 } else if (com.numarg == '~') {
224                                         add_to_buffer(&bufptr,0xA0);/* NO-BREAK SPACE */
225                                 } else if (com.numarg == '-') {
226                                         add_to_buffer(&bufptr,0xAD);/* Optional hyphen */
227                                 }       
228
229                                    break;
230                         case RTF_EMDASH:
231                                    add_to_buffer(&bufptr,0x2014);/* EM DASH*/
232                                    break;
233                         case RTF_ENDASH: 
234                                    add_to_buffer(&bufptr,0x2013);break;
235                         case RTF_BULLET: 
236                                    add_to_buffer(&bufptr,0x2022);break;
237                         case RTF_LQUOTE: add_to_buffer(&bufptr,0x2018);break;
238                         case RTF_RQUOTE: add_to_buffer(&bufptr,0x2019);break;
239                         case RTF_LDBLQUOTE: add_to_buffer(&bufptr,0x201C);break;
240                         case RTF_RDBLQUOTE: add_to_buffer(&bufptr,0x201D);break;
241                         case RTF_ZWNONJOINER: add_to_buffer(&bufptr,0xfeff);break;
242                         case RTF_EMSPACE:
243                         case RTF_ENSPACE:
244                                         add_to_buffer(&bufptr,' ');break;
245                         case RTF_CHAR:
246 /*                              fprintf(stderr, "RTF char %d\n", com.numarg); */
247                                 if (data_skip_mode == 0) {
248                                         add_to_buffer(&bufptr,rtf_to_unicode(com.numarg));
249                                 }       
250                                 break;
251                         case RTF_UC:
252                                 groups[group_count].uc=com.numarg;
253                                 break;
254                         case RTF_TAB:
255                                 add_to_buffer(&bufptr,0x0009);
256                                 break;
257                         case RTF_UNICODE_CHAR:
258                                 if (com.numarg < 0)
259                                         break;
260 /*                              fprintf(stderr, "Unicode char %d\n", com.numarg);  */
261                                 if (data_skip_mode == 0)
262                                         add_to_buffer(&bufptr,com.numarg);
263                                 i=groups[group_count].uc;
264                                 while((--i)>0)
265                                         fgetc(f);
266                                 break;
267                         case RTF_PARA:
268                                 /*if (para_mode > 0) {*/
269                                         end_paragraph(&bufptr); 
270                                 /*}*/   
271                                 para_mode=group_count;
272                                 break;
273                         case RTF_PICT:
274                         case RTF_FONTTBL:
275                         case RTF_INFO:
276                         case RTF_COLORTBL:
277                         case RTF_STYLESHEET:
278                         case RTF_LISTTABLE:
279                         case RTF_LISTOVERRIDETABLE:
280                         case RTF_RSIDTBL:
281                         case RTF_GENERATOR:
282                         case RTF_DATAFIELD:
283                                 if (data_skip_mode == 0){
284                                         data_skip_mode=group_count;
285                                 }
286                                 break;
287                         case RTF_LANG:
288 /*                              fprintf(stderr, "Selected lang = %d\n",com.numarg); */
289                                 break;
290                         case RTF_CODEPAGE:
291                                 rtfSetCharset(&current_charset,com.numarg);
292                         default:
293 /*                              fprintf(stderr, "Unknown command with name %s and arg=%d\n",  */
294 /*                                              com.name, com.numarg);  */
295                         ;
296                         }
297                         break;
298                 }
299                 case '{':
300                         group_count++;
301                         if (group_count >= group_store ) {
302                                 group_store+=10;
303                                 if((groups=(RTFGroupData*)realloc(groups,
304                                                                                                   group_store*sizeof(RTFGroupData)))
305                                    == NULL ) {
306                                         perror("Can\'t allocate memory: ");
307                                         return 1;
308                                 }
309                         }
310                         if (para_mode)
311                                 add_to_buffer(&bufptr,0x20);
312                         groups[group_count]=groups[group_count-1];
313                         break;
314                 case '}':
315                         group_count--;
316                         if(group_count < 0)
317                                 group_count=0;
318                         if(para_mode > 0 && para_mode > group_count) {
319                                 /*add_to_buffer(&bufptr,0);
320                                 output_paragraph(buffer);
321                                 fprintf(stderr,"\nGROUP_END para_mode=%d group_count=%d bufptr=%d\n", para_mode,group_count,bufptr);
322                                 bufptr=-1;*/
323                                 para_mode=0;
324                         }
325                         if(data_skip_mode > group_count) {
326                                 data_skip_mode=0;
327                         }
328                         break;
329                 default:
330                         if (data_skip_mode == 0)
331                                 if (c != '\n' && c != '\r')
332                                         add_to_buffer(&bufptr,rtf_to_unicode(c));
333                 }
334         }
335         if (bufptr>=0) {
336                 add_to_buffer(&bufptr,'\n');
337                 add_to_buffer(&bufptr,0);
338                 output_paragraph(buffer);
339         }       
340         free(groups);
341         return 0;
342 }  
343
344 /** 
345  * Convert text string to number
346  * 
347  * @param f stream to read data from
348  * 
349  * @return converted number
350  */
351 signed long getNumber(FILE *f) {
352         int c,count=0;
353         char buf[RTFARGSMAXLEN];
354         
355         while(isdigit(c=fgetc(f)) || c=='-') {
356                 if(feof(f))
357                         return -1;
358                 buf[count++]=(char)c;
359         }
360         ungetc(c,f);
361         buf[count]='\0';
362         return strtol(buf, (char **)NULL, 10);
363 }
364
365 /** 
366  * Parse command stream from rtf file and fill command structure
367  * 
368  * @param f - rtf file stream
369  * @param command - pointer to RTFcommand structure to fill
370  * 
371  * @return parse code not 0 - error, 0 - success
372  */
373 int getRtfCommand(FILE *f, RTFcommand *command ) {
374         int c=fgetc(f);
375         if (isalpha(c)) {
376                 int name_count=1;
377                 command->name[0]=(char)c;
378                 while(isalpha(c=fgetc(f)) && name_count < RTFNAMEMAXLEN) {
379                         if(feof(f))
380                                 return 1;
381                         command->name[name_count++]=(char)c;
382                 }
383                 command->name[name_count]='\0';
384                 command->type=getCommandType(command->name);
385 /*              command->args=NULL; */
386                 ungetc(c,f);
387                 if (isdigit(c) || c == '-' )
388                         command->numarg=getNumber(f);
389                 else
390                         command->numarg=0;
391                 c=fgetc(f);
392                 if(!(c==' ' || c=='\t'))
393                         ungetc(c,f);
394         } else {
395                 command->name[0]=(char)c;
396                 command->name[1]='\0';
397 /*              command->args=NULL; */
398                 if (c == '\'') {
399                         command->type=RTF_CHAR;
400                         command->numarg=getCharCode(f);
401                         if(feof(f))
402                                 return -1;
403                 } else {
404                         command->type=RTF_SPEC_CHAR;
405                         command->numarg=c;
406                 }
407         }
408         
409         return 0;
410 }
411
412 /** 
413  * Converts char to unicode.
414  * 
415  * @param code - integer code of char
416  * 
417  * @return converted char
418  */
419 unsigned short int rtf_to_unicode(int code) {
420         int cc=code;
421         if (code < 0 || (cc=to_unicode(current_charset, code)) < 0 ) return 0xFEFF;
422         return cc;
423 }
424
425 /** 
426  * Convert name of RTF command to RTFType
427  * 
428  * @param name name to convert
429  * 
430  * @return RTFType, if unknown command, then return RTF_UNKNOWN
431  */
432 RTFTypes getCommandType(char *name) {
433         int i, olen=sizeof(rtf_types)/sizeof(RTFTypeMap);
434         for (i = 0; i < olen ; i++) {
435                 if ( strcmp(name,rtf_types[i].name) == 0 ) {
436                         return rtf_types[i].type;
437                 }
438         }
439         return RTF_UNKNOWN;
440 }
441
/**
 * Reads up to two hexadecimal digits (either case) from the stream and
 * returns their value.
 *
 * Reading stops at the first character that is not a hex digit; that
 * character is pushed back onto the stream.
 *
 * @param f stream to read data from
 *
 * @return converted number; 0 if no hex digit was found
 */
signed int getCharCode(FILE *f) {
	char buf[3];    /* two digits and the terminator */
	int count=0;

	while (count < 2) {
		int c=fgetc(f);
		/* isxdigit() accepts A-F as well as a-f */
		if (!isxdigit(c)) {
			ungetc(c,f);
			break;
		}
		buf[count++]=(char)c;
	}

	buf[count]='\0';
	return (signed int)strtol(buf, (char **)NULL, 16);
}
464
465 void rtfSetCharset(short int **charset_ptr,unsigned int codepage)
466 {
467         /* Do not override charset if it is specified in the command line */
468         const char *charset_name;
469         char *save_buf = input_buffer;
470         if (forced_charset) return;
471         charset_name = charset_from_codepage(codepage);
472         check_charset(&source_csname,charset_name);
473         input_buffer=NULL;
474         *charset_ptr = read_charset(source_csname);     
475         input_buffer = save_buf;
476 }