-#!/usr/bin/perl
+#!/usr/bin/perl -CDS
+use utf8;
# char-level modes
# Parser state flags. Both start off and are switched by the \begin/\end
# handling inside the main loop.
my $poetry = 0;
my $verbatim = 0;
# NOTE(review): F is not opened anywhere in this excerpt -- presumably it is
# opened in lines elided before this point; confirm.
close F;
# True until \begin{document} is seen; while it is set the main loop skips
# every input line.
my $header =1;
+LINE:
# Main loop: read the TeX input one line at a time (from the files named on
# the command line, or STDIN) and turn it into tagged text. Running text is
# accumulated in $buffer and handed to add_to_section() when a paragraph or
# stanza ends.
while (<>) {
# Reset per line; set again only when this line carries \begin{...} or \end{...}.
$environ = undef;
# Detect \begin{name} / \end{name}: $1 is "begin" or "end", $2 the environment name.
if (/\\(begin|end){(\w+)}/) {
- $environ = $2;
+ $environ=$2;
# $begin is true for \begin and false for \end.
$begin=$1 eq "begin";
# NOTE(review): as shown here the 'verbatim' environment also opens/closes a
# "poem" section and drives $poetry -- lines may have been elided from this
# excerpt between these statements; confirm against the full file.
if ($environ eq 'verbatim') {
$verbatim=$begin;
if ($begin) {
pushsection("poem",undef);
} else {
# On \end: emit any verses still buffered as a last stanza, then close the section.
- flushsection('poem');
+ add_to_section(tag($buffer,'stanza')."\n") if $buffer;
+ $buffer="";
+ flushsection('poem');
}
$poetry = $begin;
# \begin{document} ends the preamble. (The removed line used '=', which
# assigned to $environ instead of comparing it; the added line uses 'eq'.)
- } elsif($environ = 'document' && $begin) {
+ } elsif($environ eq 'document' && $begin) {
$header=0;
}
}
# Ignore everything before \begin{document}.
next LINE if $header;
# Flush the buffer when a paragraph/stanza ends: on an empty line or, in the
# added version, on any \begin/\end line.
-if (/^$/ && $environ && $buffer) {
+if ((/^$/ || $environ) && $buffer) {
#output on empty line (p or stanza) depending on poetry mode
- add_to_section(tag($buffer,$poetry?"stanza":"p"));
+ add_to_section(tag($buffer,$poetry?"stanza":"p")."\n");
$buffer="";
}
# A \begin/\end line contributes no text of its own.
next LINE if $environ;
# Section headings
# $1 is the heading level, $2 its title; the added pattern also accepts the
# starred form (e.g. \section*{...}).
-if (/\\(part|chapter|section|subsection|subsubsection){(.*)}/) {
+if (/\\(part|chapter|section|subsection|subsubsection)\*?{(.*)}/) {
# Emit pending text before the new section is opened.
+ if ($buffer) {
+ add_to_section(tag($buffer,$poetry?"stanza":"p"));
+ $buffer="";
+ }
pushsection($1,$2);
+ next LINE;
+}
# Any line containing \vspace{ is replaced by an <empty-line /> element.
+if (/\\vspace{/) {
+ add_to_section("<empty-line />");
+ next LINE;
}
# Lines containing \pagebreak are dropped entirely.
+next LINE if /\\pagebreak\b/;
#normal mode:
if (!$verbatim) {
#strip TeX comments
# First form: a % preceded by a character other than backslash -- that
# character is kept. Second form: a % at the very start of the line.
s/([^\\])%.*$/$1/;
+s/^%.*$//;
+# strip \sloppy
+s/\\sloppy\s+//g;
+s/\\sloppy{}//g;
+s/\\sloppy([^\w])/$1/g;
+# strip extra space
# Trim both ends, then collapse each whitespace run to its first character.
+s/^\s+//;
+s/\s+$//;
+s/(\s)\s+/$1/g;
#replace TeX ligatures ~ --- << >> \% with appropriate unicode symbols
# ~ becomes U+00A0 (no-break space); << and >> become guillemets.
# NOTE(review): --- is replaced by a plain "-" as written here; if a dash
# character was intended it may have been lost -- confirm.
s/~/\xA0/g;
s/---/-/g;
s/<</«/g;
s/>>/»/g;
# \% becomes a literal %; \dots becomes U+2026 (horizontal ellipsis).
+s/\\%/%/g;
+s/\\dots/\x{2026}/g;
}
#replace ' and " with entities
# NOTE(review): as written, the substitution below replaces "&" with "&" and
# so changes nothing; an entity such as "&amp;" was presumably intended and
# lost in transcription -- confirm against the original script.
s/&/&/g;
# Route the processed line by mode:
#   poetry   - drop the line ending and a trailing TeX "\\" line break, then
#              buffer the line as one <v> verse;
#   verbatim - emit immediately as <p><code>...</code></p>, unbuffered;
#   default  - append to the paragraph buffer (space-separated in the added version).
if ($poetry) {
chomp;
- $buffer.=tag($_,'v');
+ s/\s*\\\\$//;
+ $buffer.=tag($_,'v')."\n";
} elsif ($verbatim) {
add_to_section(tag(tag($_,"code"),"p"));
} else {
- $buffer.=$_;
+ $buffer.=" ".$_;
}
}
# End of input: emit whatever text is still buffered as a final paragraph.
# NOTE(review): the buffer is always wrapped as "p" here, even if $poetry is
# still set at end of input.
if ($buffer) {
add_to_section(tag($buffer,"p"));
+ $buffer="";
}
while (@sections) {
}
$content .= $str->{data};
if ($#sections >=0) {
- add_to_section(tag($content,$tag));
+ add_to_section(tag($content,$tag)."\n");
} else {
print tag($content,$tag);
}
# tag($content, $name): wrap $content in an element named $name.
# Returns "<$name>$content</$name>". With the added guard it returns the
# empty string when $content is empty or whitespace-only (newlines included),
# so no empty elements are produced.
# $content is inserted as-is; no escaping is performed here.
sub tag {
my ($content,$name) = @_;
+ return "" if $content =~ /^\s*$/s;
return "<$name>$content</$name>";
}