#!/usr/bin/env perl
# Use the LaTeX::ToUnicode module (also in the bibtexperllibs
# repository/package, like this script) to convert LaTeX to Unicode.
#
# We work on fragments of text, not whole documents, the goal being to
# replace LaTeX commands and syntax with obvious plain text equivalents,
# or remove them.

use strict;
use warnings;

use Cwd;
use File::Basename;
use File::Spec;

BEGIN {
  # find files relative to our installed location within TeX Live
  chomp(my $TLMaster = `kpsewhich -var-value=SELFAUTOPARENT`); # TL root
  if (length($TLMaster)) {
    unshift @INC, "$TLMaster/texmf-dist/scripts/bibtexperllibs";
  }
  # find development bibtexperllibs in sibling checkout to this script,
  # even if $0 is a symlink. Irrelevant when using from an installation.
  my $real0 = Cwd::abs_path($0);
  my $scriptdir = File::Basename::dirname($real0);
  my $dev_btxperllibs = Cwd::abs_path("$scriptdir/../..");

  # we need the lib/ subdirectories inside ...
  unshift (@INC, glob ("$dev_btxperllibs/*/lib")) if -d $dev_btxperllibs;
}

use LaTeX::ToUnicode;

our %opts;     # command line options, filled in by init()
local *OUT;    # output filehandle

exit(main());


# Read the input files (or standard input) line by line, convert each
# line, and print the result to OUT.  Returns 0, used as the exit status.
#
sub main {
  init();

  # by paragraph?
  while (my $line = <>) {
    print OUT (convert($line));
  }

  # Buffered write errors (disk full, etc.) only show up at close time,
  # so check it rather than silently losing output.
  close(OUT) or die "close of output failed: $!\n";

  return 0;
}


# Convert the LaTeX string IN using LaTeX::ToUnicode::convert, passing
# along the options selected on the command line and, if the config file
# defined one, the LaTeX_ToUnicode_convert_hook function.  Returns the
# converted string.
#
sub convert {
  my ($in) = @_;

  my @args = (); # what we'll pass to the convert() fn.
  #
  if (defined(&{"LaTeX_ToUnicode_convert_hook"})) {
    push (@args, "hook" => \&LaTeX_ToUnicode_convert_hook);
  }
  if ($opts{e}) {
    push (@args, "entities" => 1);
  }
  if ($opts{g}) {
    push (@args, "german" => 1);
  }
  if ($opts{h}) {
    push (@args, "html" => 1);
  }

  LaTeX::ToUnicode::debuglevel($opts{v});
  my $out = LaTeX::ToUnicode::convert($in, @args);

  #warn "out=$out";
  return $out;
}


# Command line options, etc.
#
# Parses the command line into %opts, handles --help and --version
# (which exit), sets up the OUT filehandle, and loads the config file if
# one was given.  Dies on an option error, an unwritable output file, or
# an unreadable config file.
#
sub init {
  my $USAGE = <<END;
Usage: $0 [-c CONFIG] [-o OUTPUT] [--html] [...] [INFILE]...

Convert the LaTeX source in INFILE (or standard input) to plain text
using Unicode code points for accents and other special characters; or,
optionally, output HTML with simple translations for font changes and url
commands.

Common accent sequences, special characters, and simple markup commands
are translated, but there is no attempt at completeness. Math, tables,
figures, sectioning, etc., are not handled in any way, and mostly left
in their TeX form in the output. The translations assume standard LaTeX
meanings for characters and control sequences; macros in the input are
not considered.

The input can be a fragment of text, not a full document, as the purpose
of this script was to handle bibliography entries and abstracts (for the
ltx2crossrefxml script that is part of the crossrefware package).

Patches to extend this script are welcome. It uses the LaTeX::ToUnicode
Perl library for the conversion; see its documentation for details.

Conversion is currently done line by line, so TeX constructs that cross
multiple lines are not handled properly. If it turns out to be useful,
conversion could be done by paragraph instead.

The config file is read as a Perl source file. It can define a function
`LaTeX_ToUnicode_convert_hook()' which will be called early; the value
it returns (which must be a string) will then be subject to the standard
conversion.

For an example of using this script and associated code, see the TUGboat
processing at
https://github.com/TeXUsersGroup/tugboat/tree/trunk/capsules/crossref.

Options:
  -c, --config=FILE  read (Perl) config FILE for a hook, as explained above
  -e, --entities     output entities &#xNNNN; instead of literal characters
  -g, --german       handle some features of the german package
  -h, --html         output simplistic HTML instead of plain text
  -o, --output=FILE  output to FILE instead of stdout
  -v, --verbose      be verbose
  -V, --version      output version information and exit
  -?, --help         display this help and exit

Options can be abbreviated unambiguously, and start with either - or --.

Dev sources, bug tracker: https://github.com/borisveytsman/bibtexperllibs
Releases: https://ctan.org/pkg/bibtexperllibs
END

  my $VERSION = <<END;
ltx2unitxt (bibtexperllibs) 0.51
Copyright 2023 Karl Berry.
This is free software: you can redistribute it and/or
modify it under the same terms as Perl itself.
END

  use Getopt::Long qw(:config no_ignore_case); # otherwise v|V is the same

  GetOptions(
    "config|c=s" => \($opts{c}),
    "entities|e" => \($opts{e}),
    "german|g"   => \($opts{g}),
    "html|h"     => \($opts{h}),
    "output|o=s" => \($opts{o}),
    "verbose|v"  => \($opts{v}),
    "version|V"  => \($opts{V}),
    "help|?"     => \($opts{help}))
  || die "Try $0 --help for more information.\n";

  if ($opts{help}) { print "$USAGE\n$VERSION"; exit 0; }
  if ($opts{V})    { print $VERSION;           exit 0; }

  binmode(STDOUT, ":utf8");
  *OUT = *STDOUT;

  if (defined($opts{o})) {
    # Three-argument open, so that no character in the user-supplied
    # file name can be taken as part of the open mode.
    open(OUT, ">", $opts{o}) or die "open(>$opts{o}) failed: $!\n";
    binmode(OUT, ":utf8");
  }

  if ($opts{c}) {
    if (-r $opts{c}) {
      # if config arg is absolute, fine; if not, prepend "./" as slightly
      # less troublesome than putting "." in the @INC path.
      my $rel = (File::Spec->file_name_is_absolute($opts{c}) ? "" : "./");
      my $cnffile = "$rel$opts{c}";
      verbose("requiring config file: $cnffile\n");
      require $cnffile;
    } else {
      die "open config file ($opts{c}) for reading failed: $!\n";
    }
  }
}


# Print the arguments as a diagnostic if --verbose was given.  This goes
# to standard error: OUT is an alias of STDOUT, so printing to standard
# output would mix the message into the converted text (even with -o).
#
sub verbose {
  print STDERR @_ if $::opts{v};
}