1 /*****************************************************************/
2 /* BIFF-stream (excel file) parsing */
4 /* This file is part of catdoc project */
5 /* (c) David Rysdam 1998 */
6 /* (c) Victor Wagner 1998-2003, (c) Alex Ott 2003 */
7 /*****************************************************************/
20 #include "../compat/strftime.h"
/* Buffer that receives the payload of the BIFF record currently being read
 * (do_table reads into it and hands it to process_item). */
22 static unsigned char rec[MAX_MS_RECSIZE];
/* Growable table of format indexes; process_item appends one entry for each
 * 0x43 record it sees and isDateFormat looks format codes up in it. */
24 short int *formatTable=NULL;
/* When non-NULL, isDateFormat returns this strftime format instead of the
 * built-in one for every cell recognised as a date. */
25 char *forced_date_format = NULL;
/* Number of entries currently stored in formatTable. */
26 size_t formatTableIndex = 0;
/* Number of entries formatTable has room for (grown in steps of 16). */
27 size_t formatTableSize = 0;
/* Excel serial value subtracted by float2date before converting days to
 * seconds; do_table resets it to 25569.0 (Windows 1900 date system). */
28 double date_shift = 25569.0;
/* FLT_FORMAT pastes string a, the stringized token b and string c together.
 * MK_FORMAT adds one level of macro expansion so that its argument is
 * expanded before being stringized, giving e.g. "%.15g" when DBL_DIG is 15. */
29 #define FLT_FORMAT(a,b,c) a #b c
30 #define MK_FORMAT(x) FLT_FORMAT("%.",x,"g")
/* printf format used by number2string for cells that are not dates. */
31 char number_format[8]=MK_FORMAT(DBL_DIG);
/* Forward declaration: do_table calls this before its definition appears. */
33 void CleanUpFormatIdxUsed(void);
/*
 * Reads a BIFF stream from `input`: checks that it starts with a BOF record
 * of a recognised version, then reads records one by one and passes each to
 * process_item. `filename` is used in the diagnostics printed to stderr.
 * NOTE(review): only part of this function's control flow is visible in
 * this listing; the comments below describe the visible statements only.
 */
35 void do_table(FILE *input,char *filename) {
37 long reclen,build_year=0,build_rel=0,offset=0;
/* Per-file state is reset before parsing starts. */
40 date_shift=25569.0; /* Windows 1900 date system */
41 CleanUpFormatIdxUsed();
/* The stream starts with two 2-byte fields: the BOF/version word and the
 * length of the BOF record. */
43 catdoc_read(rec,2,1,input);
44 biff_version=getshort(rec,0);
45 catdoc_read(rec,2,1,input);
46 reclen=getshort(rec,0);
/* Version words accepted as a BOF record. */
47 if ( biff_version == 0x0809 || biff_version == 0x0409 ||
48 biff_version == 0x0209 || biff_version == 0x0009 ) {
/* Only BOF records of length 8 or 16 are accepted here. */
49 if (reclen==8 || reclen==16) {
50 if (biff_version == 0x0809 ) {
51 itemsread=catdoc_read(rec,4,1,input);
52 build_year=getshort(rec+2,0);
53 build_rel=getshort(rec,0);
55 itemsread=catdoc_read(rec,8,1,input);
63 } else if (biff_version == 0x0209 ) {
66 } else if (biff_version == 0x0409 ) {
/* Consume whatever is left of the BOF record. */
72 itemsread=catdoc_read(rec,reclen-offset,1,input);
75 fprintf(stderr,"%s: Invalid BOF record\n",filename);
79 itemsread=catdoc_read(rec,126,1,input);
82 if (catdoc_eof(input)) {
83 fprintf(stderr,"%s: No BOF record found\n",filename);
/* Record loop: a 2-byte record type followed by a 2-byte record length. */
89 itemsread = catdoc_read(buffer, 2, 1, input);
90 if (catdoc_eof(input)) {
/* End of input is reported to process_item as an MSEOF record. */
91 process_item(MSEOF,0,NULL);
95 rectype=getshort(buffer,0);
100 itemsread = catdoc_read(buffer, 2, 1, input);
101 reclen=getshort(buffer,0);
/* The payload is read only when its length is non-zero and fits in rec. */
102 if (reclen && reclen <MAX_MS_RECSIZE &&reclen >0){
103 itemsread = catdoc_read(rec, 1, reclen, input);
107 if (rectype != BOF) {
111 /* fprintf(stderr,"Rectype 0x%04X reclen=%d\n",rectype, reclen); */
112 process_item(rectype,reclen,rec);
113 if (rectype == MSEOF) {
/* File-level state shared by process_item, parse_sst and the cell handlers:
 * the parsed shared string table (sst, sstsize), the raw bytes of the SST
 * gathered so far (sstBuffer, sstBytes), the codepage taken from the
 * workbook, and the cell that a later string-result record must fill in. */
121 unsigned char **sst=NULL;/* Shared string table parsed into array of strings in
123 int sstsize = 0; /*Number of strings in SST*/
124 unsigned char *sstBuffer=NULL; /*Unparsed sst to accumulate all its parts*/
125 int sstBytes = 0; /*Size of SST Data, already accumulated in the buffer */
126 int codepage=1251; /*default*/
128 /* holds a pointer to formula value, because value itself would be in
131 unsigned char **saved_reference = NULL;
/*
 * Handles one BIFF record.
 *   rectype - record type word read from the stream
 *   reclen  - length in bytes of the record payload
 *   rec     - the payload (do_table passes NULL together with MSEOF)
 * Cell records store their text into the slot returned by allocate(row,col).
 * At the end prev_rectype is updated, except where a branch returns early
 * on purpose.
 * NOTE(review): most case labels of the switch are not visible in this
 * listing; the comments below describe the visible statements only.
 */
133 void process_item (int rectype, int reclen, char *rec) {
134 if (rectype != CONTINUE && prev_rectype == SST) {
135 /* we have accumulated unparsed SST, and now encountered
136 * another record, which indicates that SST is ended */
137 /* fprintf(stderr,"parse sst!\n");*/
138 parse_sst(sstBuffer,sstBytes);
142 fprintf(stderr,"File is encrypted\n");
147 fprintf(stderr,"File is write protected\n");
/* Codepage record: ignored when a source charset is already set. */
152 if (source_charset) break;
153 codepage=getshort(rec,0);
154 /*fprintf(stderr,"CODEPAGE %d\n",codepage); */
/* For every codepage other than 1200 the matching charset table is loaded. */
155 if (codepage!=1200) {
156 const char *cp = charset_from_codepage(codepage);
157 source_charset=read_charset(cp);
/* An explicitly stored format: its index is marked as used so that it is
 * no longer treated as a built-in date format. */
163 format_code=getshort(rec,0);
164 SetFormatIdxUsed(format_code);
165 /* this debug code prints format string */
169 fprintf(stderr,"Format %x \"",format_code);
170 if (rec[2] == reclen - 3 && rec[3] != 0) {
171 for (i=0,ptr=rec+3;i<rec[2];i++,ptr++) {
175 for (i=0,ptr=rec+5;i<rec[2];i++,ptr+=2) {
179 fprintf (stderr,"\"\n");
184 /* Just copy SST into buffer, and wait until we get
185 * all CONTINUE records
187 /* fprintf(stderr,"SST\n"); */
188 /* If exists first SST entry, then just drop it and start new*/
189 if (sstBuffer != NULL)
194 sstBuffer=(char*)malloc(reclen);
196 if (sstBuffer == NULL ) {
197 perror("SSTptr alloc error! ");
200 memcpy(sstBuffer,rec,reclen);
/* A continuation record is of interest only when it follows an SST. */
204 if (prev_rectype != SST) {
205 return; /* to avoid changing of prev_rectype;*/
/* Append this continuation to the bytes accumulated so far. */
207 sstBuffer=realloc(sstBuffer,sstBytes+reclen);
208 if (sstBuffer == NULL ) {
209 perror("SSTptr realloc error! ");
212 memcpy(sstBuffer+sstBytes,rec,reclen);
/* Cell holding an inline string: the string starts at payload offset 6. */
218 unsigned char **pcell;
219 unsigned char *src=(unsigned char *)rec+6;
221 saved_reference=NULL;
222 row = getshort(rec,0);
223 col = getshort(rec,2);
224 /* fprintf(stderr,"LABEL!\n"); */
225 pcell=allocate(row,col);
226 *pcell=copy_unicode_string(&src);
229 case BLANK: { int row,col;unsigned char **pcell;
230 row = getshort(rec,0);
231 col = getshort(rec,2);
232 pcell=allocate(row,col);
/* Range of cells in one row: the last column is stored in the final two
 * bytes of the payload, and only that last cell is allocated here. */
237 int row, startcol,endcol;
238 unsigned char **pcell;
239 row = getshort(rec,0);
240 startcol = getshort(rec,2);
241 endcol=getshort(rec,reclen-2);
242 pcell=allocate(row,endcol);
/* Cell that refers to an entry of the shared string table by index. */
246 case CONSTANT_STRING: {
247 int row = getshort(rec,0);
248 int col = getshort(rec,2);
249 unsigned char **pcell;
250 int string_no=getshort(rec,6);
252 fprintf(stderr,"CONSTANT_STRING before SST parsed\n");
255 /* fprintf(stderr,"col=%d row=%d no=%d\n",col,row,string_no); */
257 saved_reference=NULL;
258 pcell=allocate(row,col);
259 if (string_no>=sstsize|| string_no < 0 ) {
260 fprintf(stderr,"string index out of boundary\n");
262 } else if (sst[string_no] !=NULL) {
/* The cell receives its own copy of the table entry. */
265 len=strlen(sst[string_no]);
266 outptr=*pcell=malloc(len+1);
267 strcpy(outptr,sst[string_no]);
/* Floating point cell: 8-byte value at offset 6, format code at offset 4.
 * NOTE(review): from here on rows are taken relative to startrow, while the
 * string and blank branches above use the raw row number - confirm intended. */
279 unsigned char **pcell;
281 saved_reference=NULL;
282 row = getshort(rec,0)-startrow;
283 col = getshort(rec,2);
284 pcell=allocate(row,col);
285 *pcell=strdup(format_double(rec,6,getshort(rec,4)));
/* Integer cell: value read at offset 7, format code at offset 4. */
290 unsigned char **pcell;
292 row = getshort(rec,0)-startrow;
293 col = getshort(rec,2);
294 pcell=allocate(row,col);
295 *pcell=strdup(format_int(getshort(rec,7),getshort(rec,4)));
/* Single RK-encoded number: format code at offset 4, RK value at offset 6. */
300 int row,col,format_code;
301 unsigned char **pcell;
303 saved_reference=NULL;
304 row = getshort(rec,0)-startrow;
305 col = getshort(rec,2);
306 pcell=allocate(row,col);
307 format_code = getshort(rec,4);
308 *pcell=strdup(format_rk(rec+6,format_code));
/* Run of RK numbers in one row: 6 bytes per cell (2-byte format code
 * followed by the RK value), starting at payload offset 4. */
312 int row,col,startcol,endcol,offset,format_code;
313 unsigned char **pcell;
314 row = getshort(rec,0)-startrow;
315 startcol = getshort(rec,2);
316 endcol = getshort(rec,reclen-2);
317 saved_reference=NULL;
319 for (offset=4,col=startcol;col<=endcol;offset+=6,col++) {
320 pcell=allocate(row,col);
321 format_code=getshort(rec,offset);
322 *pcell=strdup(format_rk(rec+offset+2,format_code));
/* Formula cell: bytes 12 and 13 both 0xFF mean the cached result is not a
 * floating point number; rec[6] then tells which kind of result it is. */
329 unsigned char **pcell;
330 saved_reference=NULL;
331 row = getshort(rec,0)-startrow;
332 col = getshort(rec,2);
333 pcell=allocate(row,col);
334 if (((unsigned char)rec[12]==0xFF)&&(unsigned char)rec[13]==0xFF) {
335 /* not a floating point value */
341 } else if (rec[6]==2) {
345 } else if (rec[6]==0) {
/* Remember the cell so that a later string record can fill it in. */
346 saved_reference=pcell;
349 int format_code=getshort(rec,4);
350 *pcell=strdup(format_double(rec,6,format_code));
/* String record: supplies the value for the cell remembered in
 * saved_reference. */
355 unsigned char *src=(unsigned char *)rec;
356 if (!saved_reference) {
357 fprintf(stderr,"String record without preceeding string formula\n");
360 *saved_reference=copy_unicode_string(&src);
365 fprintf(stderr,"BOF when current sheet is not flushed\n");
371 case 0x43: /*from perl module Spreadsheet::ParseExcel */
373 short int formatIndex = getshort(rec,2);
374 /* we are interested only in format index here */
/* The table grows by 16 entries whenever it is full. */
375 if (formatTableIndex >= formatTableSize) {
376 formatTable=realloc(formatTable,
377 (formatTableSize+=16)*sizeof(short int));
380 fprintf(stderr,"Out of memory for format table");
384 formatTable[formatTableIndex++] = formatIndex;
387 case MS1904: /* Macintosh 1904 date system */
399 /* fprintf(stderr,"Row! %d %d %d\n",getshort(rec,0), getshort(rec+2,0),getshort(rec+4,0)); */
403 /* fprintf(stderr,"INDEX! %d %d\n", getlong(rec+4,0), getlong(rec+8,0)); */
408 fprintf(stderr,"Unknown record 0x%x\n length %d\n",rectype,reclen);
/* Remembered so that the next call can tell whether it follows an SST. */
412 prev_rectype=rectype;
/*
 * Decodes the string found at *src and returns it in a malloc'ed buffer;
 * *src is advanced past the bytes that belong to the string (header,
 * characters and any trailing extra data counted in to_skip).
 * NOTE(review): only part of this function is visible in this listing; the
 * comments below describe the visible statements only.
 */
416 * Extracts string from sst and returns mallocked copy of it
418 char *copy_unicode_string (unsigned char **src) {
422 int to_skip=0; /* used to count the length of the data that lies
423 * past the end of the string */
424 int offset = 1; /* accounts for the variable length of the first field */
426 /* char *realstart=*src; */
427 char *dest; /* where the string will be copied to */
432 /* for(i=0;i<20;i++) */
433 /* fprintf(stderr,"%02x ",(*src)[i]); */
434 /* fprintf(stderr,"\n"); */
/* The flags byte is looked for at alternative positions; only the values
 * listed in the conditions below are accepted as valid flags. */
436 flags = *(*src+1+offset);
437 if (! ( flags == 0 || flags == 1 || flags == 8 || flags == 9 ||
438 flags == 4 || flags == 5 || flags == 0x0c || flags == 0x0d ) ) {
440 flags = *(*src+offset);
442 flags = *(*src+1+offset);
443 if (! ( flags == 0 || flags == 1 || flags == 8 || flags == 9 ||
444 flags == 4 || flags == 5 || flags == 0x0c || flags == 0x0d ) ) {
445 /* fprintf(stderr,"Strange flags = %d, returning NULL\n", flags); */
/* Character count, then character width: bit 0 of flags selects 2-byte
 * characters. */
450 count=getshort(*src,0);
452 charsize=(flags &0x01) ? 2 : 1;
/* Bits 2 and 3 of flags tell which optional header fields are present; they
 * determine where the characters start and how much extra data follows. */
454 switch (flags & 12 ) {
455 case 0x0c: /* Far East with RichText formatting */
456 to_skip=4*getshort(*src,2+offset)+getlong(*src, 4+offset);
457 start_offset=2+offset+2+4;
458 /* fprintf(stderr,"Far East with RichText formating\n"); */
461 case 0x08: /* With RichText formatting */
462 to_skip=4*getshort(*src,2+offset);
463 start_offset=2+offset+2;
464 /* fprintf(stderr,"With RichText formating %d\n",getshort(*src,2+offset)); */
467 case 0x04: /* Far East */
468 to_skip=getlong(*src, 2+offset);
469 start_offset=2+offset+4;
470 /* fprintf(stderr,"Far East\n"); */
475 start_offset=2+offset;
476 /* fprintf(stderr,"Default string\n"); */
479 /* fprintf(stderr,"count=%d skip=%d start_offset=%d\n", */
480 /* count, to_skip, start_offset); */
481 /* and here we copy the string */
482 if ( (dest=malloc(count+1)) == NULL ) {
483 perror("Dest string alloc error");
/* Even on allocation failure the source pointer is moved past the string. */
484 *src+=(to_skip+start_offset+(count*charsize));
490 for (s=*src,d=dest,i=0;i<count;i++,s+=charsize) {
491 /* fprintf(stderr,"l=%d len=%d count=%d charsize=%d\n",l,len,count,charsize); */
/* A byte of 0 or 1 met here is taken as a new flags byte, and the character
 * width is re-derived from its bit 0. */
492 if ( (charsize == 1 && (*s == 1 || *s == 0)) ||
493 (charsize == 2 && (*s == 1 || *s == 0) && *(s+1) != 4)) {
494 /* fprintf(stderr,"extchar (unicode)=%02x %02x\n",*s, *(s+1)); */
495 charsize=(*s &0x01) ? 2 : 1;
/* 2-byte characters are read directly as 16-bit code units; 1-byte
 * characters are mapped through the source charset, which is loaded on
 * first use. */
501 if ( charsize == 2 ){
502 u=(unsigned short)getshort(s,0);
504 /* fprintf(stderr,"char=%02x %02x\n", *s, *(s+1)); */
506 if (!source_charset) {
507 check_charset(&source_csname,source_csname);
508 /* fprintf(stderr,"charset=%s\n",source_csname);*/
509 source_charset=read_charset(source_csname);
511 u=(unsigned short)to_unicode(source_charset,(unsigned char)*s);
518 dest=realloc(dest,len+1);
/*
 * A format code is an index into the format table (the list of XF
 * records); the second word of an XF record is the format type index.
 * Format type indexes 0x0E..0x16, 0x2D..0x2F and 0xA4 denote dates unless
 * the workbook stores an explicit format under the same index.
 *
 * BuiltInDateFormatIdx maps such a format type index onto the slot of the
 * matching built-in strftime format:
 *   0x0E..0x16 -> 1..9,  0x2D..0x2F -> 10..12,  0xA4 -> 13.
 * Returns 0 when the index is not one of the built-in date formats.
 */
int BuiltInDateFormatIdx (int index) {
	const int first_slot = 1;	/* slot 0 is reserved for "not found" */

	if (index == 0xa4) {
		return first_slot + 12;
	}
	if (index >= 0x2d && index <= 0x2F) {
		return first_slot + 9 + (index - 0x2d);
	}
	if (index >= 0x0E && index <= 0x16) {
		return first_slot + (index - 0x0E);
	}
	return 0;
}
/*
 * GetBuiltInDateFormat holds the strftime formats that stand in for
 * Excel's built-in date formats and returns the one stored in slot
 * `dateindex` (a value produced by BuiltInDateFormatIdx).
 * Returns NULL for slot 0 and for any slot outside 1..NUMOFDATEFORMATS.
 */
#define NUMOFDATEFORMATS 13
char *GetBuiltInDateFormat(int dateindex) {
	/* Slot 0 is a placeholder: BuiltInDateFormatIdx uses 0 to say
	 * "format not found". The comment on each row is the Excel format
	 * index the slot stands for. */
	static char *formats[] = {
		NULL,                    /* reserved */
		"%m-%d-%y",              /* 0x0E */
		"%d-%b-%y",              /* 0x0F */
		"%d-%b",                 /* 0x10 */
		"%b-%d",                 /* 0x11 */
		"%l:%M %p",              /* 0x12 */
		"%l:%M:%S %p",           /* 0x13 */
		"%H:%M",                 /* 0x14 */
		"%H:%M:%S",              /* 0x15 */
		"%m-%d-%y %H:%M",        /* 0x16 */
		"%M:%S",                 /* 0x2d */
		"%H:%M:%S",              /* 0x2e */
		"%M:%S",                 /* 0x2f */
		"%m.%d.%Y %l:%M:%S %p"   /* 0xa4 */
	};

	if (dateindex <= 0 || dateindex > NUMOFDATEFORMATS) {
		return NULL;
	}
	return formats[dateindex];
}
583 static char FormatIdxUsed[NUMOFDATEFORMATS];
585 void CleanUpFormatIdxUsed() {
587 for (i=0;i<NUMOFDATEFORMATS; i++);
592 * format index between 0x0E and 0x16 also between 0x2D and ox2F denotes
593 * date in case when they are built-in Excel97 formats.
594 * Nevertheless, those indexes can be used for explicitly stored formats,
595 * which are not dates in general.
596 * SetFormatIdxUsed marks this formats as already used
597 * and excludes them from list of built-in formats
598 * preventing misformatting of corresponding data.
600 void SetFormatIdxUsed(int format_code) {
602 /*fprintf(stderr,"Format idx %x to be set to dirty\n",format_code);
604 dateindex=BuiltInDateFormatIdx(format_code);
606 FormatIdxUsed[dateindex]=1;
607 /*fprintf(stderr,"Date idx %d is set to be dirty\n",dateindex); */
612 * format index between 0x0E and 0x16 also between 0x2D and ox2F denotes
613 * date in case when they are built-in Excel97 formats.
614 * Nevertheless, those indexes can be used for explicitly stored formats,
615 * which are not dates in general.
616 * SetFormatIdxUsed marks this formats as already used
617 * and excludes them from list of built-in formats
618 * preventing misformatting of corresponding data.
619 * IsFormatIdxUsed tests this case.
621 char IsFormatIdxUsed(int format_code) {
623 dateindex=BuiltInDateFormatIdx(format_code);
625 /* fprintf(stderr,"Date idx %d is dirty\n",dateindex); */
626 return FormatIdxUsed[dateindex]==1;
632 /* Checks if format denoted by given code is date
633 * Format code is index into format table (which is list of XF records
635 * Second word of XF record is format type inex
636 * format index between 0x0E and 0x16 also between 0x2D and ox2F denotes
638 * If so, it returns strftime format for this date. Otherwise returns
643 char *isDateFormat(int format_code) {
646 if (format_code>=formatTableIndex) {
647 fprintf(stderr,"Format code %d is used before definition\n",format_code);
651 index = formatTable[format_code];
652 if (IsFormatIdxUsed(index)) {
653 fprintf(stderr,"Format %x is redefined\n",index);
654 /* this format is something user-defined --- not a standard built-in date*/
657 dateindex=BuiltInDateFormatIdx(index);
659 if (forced_date_format) return forced_date_format;
660 return GetBuiltInDateFormat(dateindex);
667 time_t float2date(double d);
669 * Extracts floating point value and formats it
672 char *number2string(double d,short int format_code) {
673 static char buffer [128];
675 if ((datefmt=isDateFormat(format_code))!=NULL) {
676 time_t t = float2date(d);
677 strftime(buffer, 127,datefmt, gmtime(&t));
679 sprintf(buffer,number_format,d);
/*
 * Reads the 8-byte little-endian IEEE double stored at rec+offset and
 * returns it formatted by number2string according to `format_code`.
 * The result points to number2string's static buffer.
 */
char *format_double(char *rec,int offset,int format_code) {
	union {
		char cc[8];
		double d;
	} dconv;
	const char *field = rec + offset;
	int i;

	for (i = 0; i < 8; i++) {
#ifdef WORDS_BIGENDIAN
		/* the file is little-endian: reverse the bytes for this host */
		dconv.cc[i] = field[7 - i];
#else
		dconv.cc[i] = field[i];
#endif
	}
	return number2string(dconv.d,format_code);
}
/*
 * Formats an integer value in decimal into a static buffer and returns a
 * pointer to it; the buffer is overwritten by the next call.
 * `format_code` is accepted for symmetry with the other formatters and is
 * not used.
 */
char *format_int(int value,int format_code) {
	static char text[12];

	(void)format_code;
	sprintf(text,"%i",value);
	return text;
}
/*
 * Decodes a 4-byte RK value and returns it formatted by number2string.
 *   rec         - points at the 4 little-endian bytes of the RK value
 *   format_code - format table index passed on to number2string
 * RK layout: bit 0 set means the decoded value must be divided by 100;
 * bit 1 set means the upper 30 bits are a signed integer, otherwise they
 * are the most significant 30 bits of an IEEE double whose remaining bits
 * are zero.
 * The result points to number2string's static buffer.
 */
char* format_rk(char *rec,short int format_code) {
	double value;

	if ( *(rec) & 0x02 ) {
		/* integer payload: drop the two flag bits */
		value=(double)(getlong(rec,0)>>2);
	} else {
		union {
			unsigned char cc[8];
			double d;
		} dconv;
		int i;

		memset(dconv.cc, 0, sizeof dconv.cc);
		/* The 4 RK bytes form the high half of the double. rec[0] is the
		 * least significant of them and carries the two flag bits, which
		 * must be cleared so that they do not leak into the mantissa. */
#ifdef WORDS_BIGENDIAN
		for (i = 0; i < 4; i++) {
			dconv.cc[i] = (unsigned char)rec[3 - i];
		}
		dconv.cc[3] &= 0xfc;	/* cc[3] holds rec[0] on this host */
#else
		for (i = 0; i < 4; i++) {
			dconv.cc[4 + i] = (unsigned char)rec[i];
		}
		dconv.cc[4] &= 0xfc;	/* cc[4] holds rec[0] on this host */
#endif
		value=dconv.d;
	}
	if ( *(rec) & 0x01 ) {
		/* dividing is correctly rounded; multiplying by 0.01 is not */
		value=value/100.0;
	}
	return number2string(value,format_code);
}
742 * Converts excel date into time_t
744 time_t float2date(double f) {
745 /* Hacked version. Excell stores date as floating point count of days
746 * since 1.1.1900. or 1.1.1904
747 * We are substracting value of 1.1.1970 and multiplying
748 * by 86400 thus getting seconds from the epoch
750 return rint((f-date_shift)*86400);
754 * Parses SST into array of strings
756 void parse_sst(char *sstbuf,int bufsize) {
757 int i; /* index into sst */
758 unsigned char *curString; /* pointer into unparsed buffer*/
759 unsigned char *barrier=(unsigned char *)sstbuf+bufsize; /*pointer to end of buffer*/
760 unsigned char **parsedString;/*pointer into parsed array*/
762 sstsize = getlong(sstbuf+4,0);
763 sst=malloc(sstsize*sizeof(char *));
766 perror("SST allocation error");
769 memset(sst,0,sstsize*sizeof(char *));
770 for (i=0,parsedString=sst,curString=sstbuf+8;
771 i<sstsize && curString<barrier; i++,parsedString++) {
772 /* fprintf(stderr,"copying %d string\n",i); */
773 *parsedString = copy_unicode_string(&curString);
775 /* fprintf(stderr,"end sst i=%d sstsize=%d\n",i,sstsize); */