#!/usr/bin/perl #============================================================================ # # NAME # # cleanHtml.prl # # # DESCRIPTION # # Clean up the exported HTML files generated by Microsoft Word # or Mathematica. It has been specialized to format simple text # with equations. Put all equations on lines by themselves. # # EXAMPLE # # (1) Edit equationExample.doc with Microsoft Word, # # (2) Select "File->Save As Web Page". Select Web Page, Filtered. # Save as equationExample.htm # You will also get a directory of images called # equationExample_files # # (3) From the command line, run # $ perl cleanHtml.prl # # cleanHtml Version 1.3 - Perl utility program which cleans up # Microsoft Word # and Mathematica exported HTML containing mathematical equations. # Copyright (C) 2002-2008 by Sean Erik O'Connor. All Rights # Reserved. # # cleanHtml comes with ABSOLUTELY NO WARRANTY; for details see the # GNU General Public License. This is free software, and you are # welcome to redistribute it under certain conditions; see the GNU # General Public License for details. # # Enter the parent directory containing the files. # (RETURN = current directory). # => # Enter the name of the .HTM file # (You can leave off the .htm extension). # => equationExample # Enter the directory containing the .gif image files # (RETURN = equationExample_files). # => # Enter the new image file directory name (RETURN = WebPageImages). # => equationExampleImages # Enter the starting image number (RETURN = 1). # => # Enter the image numbering increment (RETURN = 2): # => # Done. # Your HTML file is ./equationExample.html # Your renumbered equation .gif images are in equationExampleImages # # (1) Edit equationExample.nb with Mathematica # (2) Select "File->Save As Special->HTML" # In "Save As Filename" use the name, equationExample.htm # You will get a directory of images called Images. # # (3) From the command line, run # $ perl cleanHtml.prl # # ... 
#         Enter the directory containing the .gif image files
#         (RETURN = equationExample_files):
#         => Images
#         ...
#
#     (4) Using PaintShopPro or another image converter, batch
#         convert the *.gif formulas to *.png files.
#
#     (5) Run cleanWebSite.prl which will convert the src links to each
#         *.gif equation in the *.html file to point to the *.png image
#         equations.
#
#
# DEBUGGING
#
#     perl -d cleanHtml.prl        Turn on the Perl debugger.
#     DB> b 225                    Break at specific line.
#     DB> c                        Continue.
#     DB> n                        Step to next line.
#     DB> s                        Step into subroutine.
#     DB> p $_                     Print a line.
#     DB> p $line =~ /\.html/ ;    Print 1 if we get a match.
#
#
# AUTHOR
#
#     Sean E. O'Connor     5 Jun 2002  Version 1.0 released.
#                         10 Jun 2002  Version 1.1 released.
#                         27 Apr 2006  Version 1.3 released.
#
# LEGAL
#
#     cleanHtml Version 1.3
#
#     A Perl utility program which cleans up Microsoft Word exported HTML
#     containing mathematical equations.
#
#     Copyright (C) 2002-2008 by Sean Erik O'Connor. All Rights Reserved.
#
#     This program is free software; you can redistribute it and/or
#     modify it under the terms of the GNU General Public License
#     as published by the Free Software Foundation; version 2
#     of the License.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with this program; if not, write to the Free Software
#     Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
#     The author's address is artifex@seanerikoconnor.freeservers.com.
#
#===========================================================================

# NOTE: "use strict" is deliberately not enabled.  The subroutines further
# down the file communicate through undeclared package globals
# ($newImageNumber, $imageNumberIncrement) and the bareword NEWFILE handle.


# Print the legal notice first.

printLegalNotice() ;


# Read arguments from the command line.  The image directories are stored
# under the names the rest of the script actually reads
# ($oldImageFileDirectory / $newImageFileDirectory).

if (@ARGV == 6)
{
    ( $parentDirectoryName,   $htmFileName,
      $oldImageFileDirectory, $newImageFileDirectory,
      $startingImageNumber,   $imageNumberIncrement ) = @ARGV ;
}
# Or else prompt for arguments.
elsif (@ARGV == 0)
{
    ( $parentDirectoryName,   $htmFileName,
      $oldImageFileDirectory, $newImageFileDirectory,
      $startingImageNumber,   $imageNumberIncrement ) = getUserInput() ;
}
# Wrong number of arguments:  explain and stop instead of running on with
# undefined settings.
else
{
    print "Usage:\n" ;
    print "perl cleanHtml.prl parentDirectory\n" ;
    print "                   htmFileName\n" ;
    print "                   oldImageDirectory\n" ;
    print "                   newImageDirectory\n" ;
    print "                   startingImageNumber\n" ;
    print "                   imageNumberIncrement\n" ;
    exit( 1 ) ;
}


# Add the .htm extension if it was left off, then add the directory path.

$htmFileName .= ".htm"  unless $htmFileName =~ /\.html?$/i ;
$fileName = $parentDirectoryName . "/" . $htmFileName ;


# Create a new scratch file for writing, of same name but
# a .html extension.  Only the trailing extension is rewritten, and we
# refuse to continue if that would make us overwrite the input file.

$newFileName = $fileName ;
$newFileName =~ s/\.html?$/.html/i ;

die( "Input file $fileName already ends in .html; " .
     "the cleaned output would overwrite it.\n" )
    if $newFileName eq $fileName ;


# Open the input file for reading and the output file for writing.
# NEWFILE stays a bareword handle because the process*() subroutines
# print to it by name.

open( FILE, "<", $fileName )
    or die( "Cannot open file $fileName: $!\n" ) ;
open( NEWFILE, ">", $newFileName )
    or die( "Cannot open file $newFileName: $!\n" ) ;


# Print a custom HTML header with style sheet linkage.

printHTMLHeader( \*NEWFILE ) ;


# Set starting states.

$textState      = "" ;
$imageState     = "" ;
$newImageNumber = $startingImageNumber ;


# Scan through each line of the file, copying lines to the output file.
# The line is kept in $_ (not a named lexical) because
# processMSWordSingleTextLine() inspects $_ directly.

while ( <FILE> )
{
    # Process a single Mathematica image line which may be inside
    # a Mathematica text section and skip to the next line.
    # $oldImageFileName and $newImageFileName are passed only to keep the
    # callee's five-argument signature; the callee assigns its own copies.
    if ( ($imageState eq "" || $textState eq "insideMathematicaText") &&
         detectMathematicaImageLine( $_ ) )
    {
        processMathematicaImageLine( $_, $parentDirectoryName,
                                     $oldImageFileDirectory,
                                     $oldImageFileName, $newImageFileName ) ;
        next ;
    }


    # Process the first line of an MS Word image specification.
    if ( detectMSWordImageLine( $_ ) )
    {
        $imageState = "insideImage" ;
        processMSWordImageLine( $_ ) ;
    }
    # Process the second (ending) line of an MS Word image specification.
    elsif ($imageState eq "insideImage" && detectMSWordImageEndingLine( $_ ))
    {
        $imageState = "" ;
        processMSWordImageEndingLine( $_, $parentDirectoryName,
                                      $oldImageFileDirectory,
                                      $oldImageFileName, $newImageFileName ) ;
    }


    # Process the beginning of a single MS Word text line.
    # Don't process a text line with an image in it.
    if ( $textState eq "" && detectMSWordSingleTextLine( $_ ) &&
         !detectMSWordImageLine( $_ ) )   # Image line can resemble text.
    {
        processMSWordSingleTextLine( $_ ) ;
    }
    # Process the beginning of a multiple MS Word text line.
    # Don't process a text line with an image in it.
    elsif ( $textState eq "" && detectMSWordMultilineText( $_ ) &&
            !detectMSWordImageLine( $_ ) )   # Image line can resemble text.
    {
        $textState = "insideText" ;
        processMSWordMultilineText( $_ ) ;
    }
    # End of a multiple MS Word text line.  Write it and seal off with
    # paragraph end delimiter.
    elsif ( $textState eq "insideText" && detectMSWordMultilineTextEnd( $_ ) )
    {
        $textState = "" ;
        processMSWordMultilineTextEnd( $_ ) ;
    }
    # Write intermediate MS Word text lines to file.
    elsif ($textState eq "insideText")
    {
        print NEWFILE $_ ;
    }


    # Beginning of Mathematica multiline text.  Emit paragraph begin delimiter.
    # NOTE(review): the delimiter text was blank in the copy this was restored
    # from; plain <p> / </p> is assumed from the comments -- confirm.
    if ( $textState eq "" && detectMathematicaMultilineText( $_ ) )
    {
        $textState = "insideMathematicaText" ;
        print NEWFILE "\n<p>\n\n" ;
    }
    # End of Mathematica multiline text.  Write it and seal off with
    # paragraph end delimiter.
    elsif ( $textState eq "insideMathematicaText" &&
            detectMathematicaMultilineTextEnd( $_ ) )
    {
        $textState = "" ;
        print NEWFILE "\n</p>\n\n" ;
    }
    # Write intermediate Mathematica text lines to file.
    elsif ($textState eq "insideMathematicaText")
    {
        processMathematicaIntermediateText( $_ ) ;
    }
} # end while FILE


# Print the end of the HTML file.

printHTMLTrailer( \*NEWFILE ) ;


# Close both files BEFORE touching them on disk:  deleting or renaming files
# that are still open is not portable, and buffered write errors only show
# up when the output handle is closed.

close( FILE ) ;
close( NEWFILE )
    or die( "Cannot finish writing $newFileName: $!\n" ) ;


# Rename the image file directory.  A failure is reported but not fatal,
# since the HTML file has already been written.

$imageDirectoryNow = $newImageFileDirectory ;

if (!rename( $parentDirectoryName . "/" . $oldImageFileDirectory,
             $parentDirectoryName . "/" . $newImageFileDirectory ))
{
    warn( "Cannot rename image directory $oldImageFileDirectory " .
          "to $newImageFileDirectory: $!\n" ) ;
    $imageDirectoryNow = $oldImageFileDirectory ;
}


# Delete MS Word HTM file.

deleteFile( $fileName ) ;

print "Done.\n" ;
print "Your HTML file is $newFileName\n" ;
print "Your renumbered equation .gif images are in $imageDirectoryNow\n" ;



#============================================================================
#
# NAME
#
#     deleteFile
#
#
# DESCRIPTION
#
#     Delete a file by hook or by crook.  First try a plain unlink.  If
#     that fails, rename the file to junk<time>.txt in the current
#     directory and unlink that.  If that fails too, move the junk file
#     into C:/Temp.  Progress is reported on standard output; nothing
#     useful is returned.
#
# EXAMPLE
#
#     deleteFile( "C:/junk.obj" ) ;
#
#============================================================================

sub deleteFile
{
    my( $fileName ) = @_ ;

    # The easy case.
    return  if unlink( $fileName ) ;

    # If delete doesn't work try a rename and then delete.
    print "Retrying delete $fileName.\n" ;

    my $tempFileName = "junk" . time . ".txt" ;

    if (!rename( $fileName, $tempFileName ))
    {
        print "Can't delete $fileName: $!\n" ;
        return ;
    }

    return  if unlink( $tempFileName ) ;

    # Otherwise, just park the file out of the way.  The "/" separator is
    # required so the file really lands inside C:/Temp, as the message says.
    if (rename( $tempFileName, "C:/Temp/" . $tempFileName ))
    {
        print "Can't delete $fileName. " ;
        print "Renamed it C:/Temp/$tempFileName.\n" ;
    }
    else
    {
        print "Can't delete $fileName. " ;
        print "Renamed it $tempFileName.\n" ;
    }

    return ;
}



#============================================================================
#
# NAME
#
#     printLegalNotice
#
#
# DESCRIPTION
#
#     Print the legal notice on standard output.
#
# EXAMPLE
#
#     printLegalNotice() ;
#
#============================================================================

sub printLegalNotice
{
    # One element per output line; the empty first and last elements give
    # the leading blank line and the trailing blank line.
    my @noticeLines =
    (
        "",
        "cleanHtml Version 1.3 - Perl utility program which cleans up Microsoft Word",
        "and Mathematica exported HTML containing mathematical equations.",
        "Copyright (C) 2002-2008 by Sean Erik O'Connor. All Rights Reserved.",
        "",
        "cleanHtml comes with ABSOLUTELY NO WARRANTY; for details see the",
        "GNU General Public License. This is free software, and you are welcome",
        "to redistribute it under certain conditions; see the GNU General Public License",
        "for details.",
        "",
    ) ;

    print join( "\n", @noticeLines ), "\n" ;

    return ;
}



#============================================================================
#
# NAME
#
#     promptForValue
#
#
# DESCRIPTION
#
#     Private helper for getUserInput.  Print the prompt lines and the
#     "=> " marker, read one line from standard input and return it
#     without its line ending.  An empty answer (or end of input) returns
#     the default.
#
# EXAMPLE
#
#     $n = promptForValue( 1, "Enter the starting image number",
#                             "RETURN = 1" ) ;
#
#============================================================================

sub promptForValue
{
    my( $default, @promptLines ) = @_ ;

    print "$_\n"  foreach @promptLines ;
    print "=> " ;

    my $answer = <STDIN> ;
    $answer = ""  unless defined( $answer ) ;   # End of input.
    chomp( $answer ) ;    # chomp, not chop:  never eat a real character.

    return $answer eq "" ? $default : $answer ;
}



#============================================================================
#
# NAME
#
#     getUserInput
#
#
# DESCRIPTION
#
#     Prompt the user for the parent directory, the name of the .htm file,
#     the directory containing the .gif image files, the new image
#     directory name, the starting image number and the image numbering
#     increment.  Returns those six values as a list, in that order.
#
# EXAMPLE
#
#     ($parent, $htm, $oldDir, $newDir, $start, $step) = getUserInput() ;
#
#============================================================================

sub getUserInput
{
    # Use the current directory, if none is given.
    my $parentDirectoryName =
        promptForValue( ".",
                        "Enter the parent directory containing the files.",
                        "(RETURN = current directory)" ) ;

    my $htmFileName =
        promptForValue( "",
                        "Enter the name of the .HTM file",
                        "You can leave off the .htm extension" ) ;

    $htmFileName .= ".htm"  unless $htmFileName =~ /\.html?$/i ;

    # Strip off the extension to build the MS Word default image directory.
    ( my $baseName = $htmFileName ) =~ s/\.html?$//i ;
    my $defaultImageDirectory = $baseName . "_files" ;

    my $oldImageFileDirectory =
        promptForValue( $defaultImageDirectory,
                        "Enter the directory containing the .gif image files.",
                        "RETURN = " . $defaultImageDirectory ) ;

    my $newImageFileDirectory =
        promptForValue( "WebPageImages",
                        "Enter the new image file directory name.",
                        "RETURN = WebPageImages" ) ;

    my $startingImageNumber =
        promptForValue( 1,
                        "Enter the starting image number",
                        "RETURN = 1" ) ;

    my $imageNumberIncrement =
        promptForValue( 2,
                        "Enter the image numbering increment.",
                        "RETURN = 2" ) ;

    # Return the values that were actually collected above.
    return ( $parentDirectoryName,   $htmFileName,
             $oldImageFileDirectory, $newImageFileDirectory,
             $startingImageNumber,   $imageNumberIncrement ) ;
}



#============================================================================
#
# NAME
#
#     printHTMLHeader
#
#
# DESCRIPTION
#
#     Print my technical report XHTML 1.0 header to the given file handle.
#
# EXAMPLE
#
#     printHTMLHeader( \*NEWFILE ) ;
#
#============================================================================

# Print an HTML header with style sheet linkage.

sub printHTMLHeader
{
    my( $FILEHANDLE ) = @_ ;

    # NOTE(review): apart from the document title, the literals below hold
    # only whitespace.  They are reproduced exactly as found; if the header
    # markup was meant to be here it must be restored from a pristine copy.
    my $xmlHeader     = '' . "\n\n" ;
    my $commentHeader = "\n\n" ;
    my $xmlHeader2    = '' . "\n\n" ;

    my $documentHead  =
        '' . "\n" .
        ' Document title goes here.' ."\n\n" .
        ' ' ."\n\n" .
        ' ' ."\n\n" .
        ' ' ."\n\n" .
        ' ' ."\n\n" .
        ' ' ."\n\n" .
        ' ' . "\n\n" .
        ' ' ."\n" .
        ' ' ."\n" .
        '' ."\n\n" ;

    my $bodyLine      = '' ."\n\n" ;

    print {$FILEHANDLE} $xmlHeader ;
    print {$FILEHANDLE} $commentHeader ;
    print {$FILEHANDLE} $xmlHeader2 ;
    print {$FILEHANDLE} $documentHead ;
    print {$FILEHANDLE} $bodyLine ;

    return ;
}



#============================================================================
#
# NAME
#
#     printHTMLTrailer
#
#
# DESCRIPTION
#
#     Print my technical report XHTML 1.0 trailer.
# NOTE(review): printHTMLTrailer prints $footnote and then $htmlEnd to the
# file handle passed as its only argument.  Apart from a few plain-text
# fragments (separator and button captions, copyright line, "Last updated"),
# the literals it concatenates hold only whitespace -- if footer markup was
# intended it is not present here; confirm against a pristine copy.
# NOTE(review): in detectMathematicaImageLine the "return ($line =~ /" opens
# a pattern that is never closed before comment text begins, so neither the
# pattern nor the end of that subroutine can be read from this text.
# # EXAMPLE # # #============================================================================ sub printHTMLTrailer { my( $FILEHANDLE ) ; # Define local variables ($FILEHANDLE) = @_ ; # and split up arguments. $footnote = ' ' ."\n" . '

' ."\n" . ' Blue Bar Separator.' ."\n" . '

' ."\n\n" . ' ' ."\n" . '

' ."\n" . ' ' ."\n" . ' Home Page Button.' ."\n" . ' home' ."\n" . ' ' ."\n" . '

' ."\n\n" . '

' ."\n" . ' Copyright © 2000-2008 by Sean Erik O\'Connor. ' ."\n" . ' All Rights Reserved.' ."\n" . '    ' ."\n" . ' Last updated 00 Nov 3000.' ."\n" . '

' ."\n\n" ; $htmlEnd = '' ."\n" . '' ."\n" ; print $FILEHANDLE $footnote ; print $FILEHANDLE $htmlEnd ; } #============================================================================ # # NAME # # detectMathematicaImageLine # # # DESCRIPTION # # Detect the beginning of a single image line with multiple images as we # see in Mathematica, such as # # Step 2)Check if[Graphics:Images/mathematica_gr_27.gif]... # # EXAMPLE # # #============================================================================ sub detectMathematicaImageLine { my( $line ) ; # Define local variables ($line) = @_ ; # and split up arguments. return ($line =~ / # # Step 2)  Check if   # [Graphics:Images/math_gr_27.gif] # [Graphics:Images/math_gr_28.gif] # is a primitive root of p. # [Graphics:Images/math_gr_29.gif] # # # #
#        [Graphics:Images/math_gr_25.gif]

#        [Graphics:Images/math_gr_26.gif]
# NOTE(review): processMathematicaImageLine takes ( $line,
# $parentDirectoryName, $oldImageFileDirectory, $oldImageFileName,
# $newImageFileName ).  For each piece that matches the
# "Images/<prefix>_gr_<n>.gif ... WIDTH= ... HEIGHT=" pattern it renames
# <prefix>_gr_<n>.gif to equationNNN.gif inside the old image directory,
# prints $string and any trailing text to NEWFILE, and advances the global
# $newImageNumber by the global $imageNumberIncrement.  Pieces that do not
# match are printed after the tag clean-up.
# NOTE(review): the start of the routine looks damaged.  The
# $startOfImagePattern assignment opens a double quote that is not closed
# where a pattern would end, and $image is tested with no visible assignment
# or loop header even though the body ends with "end for i".  Restore from a
# pristine copy before relying on this routine.
# # EXAMPLE # #============================================================================ sub processMathematicaImageLine { # Define local variables # and split up arguments. my( $line, $parentDirectoryName, $oldImageFileDirectory, $oldImageFileName, $newImageFileName ) ; ( $line, $parentDirectoryName, $oldImageFileDirectory, $oldImageFileName, $newImageFileName ) = @_ ; # This is the beginning of an image specification. $startOfImagePattern = "... section to the end of the line. if ( $image =~ /"Images\/(.*?)_gr_([0-9]+)\.gif.*WIDTH="([0-9]+)"\s+HEIGHT="([0-9]+)/ ) { # Extract the file name prefix. $fileNamePrefix = $1 ; # Pull out the image number. $oldImageNumber = $2 ; # Pull out the image width and height. $imageWidth = $3 ; $imageHeight = $4 ; # Pull out any text near the end. $image =~ /ALIGN="absmiddle"\s+>(.*)$/ ; $text = $1 ; # Strip off nuisance HTML code. $text =~ s/
//g ;
            # NOTE(review): the next three substitutions have EMPTY patterns.
            # Perl treats an empty pattern as "reuse the last successfully
            # matched pattern", so they do not do what the closing-tag
            # removals below (PRE, CODE, B, SAMP) suggest was intended --
            # presumably the matching opening tags; confirm.
            $text =~ s///g ;
            $text =~ s///g ;
            $text =~ s///g ;
            $text =~ s/<\/PRE>//g ;
            $text =~ s/<\/CODE>//g ;
            $text =~ s/<\/B>//g ;
            $text =~ s/<\/SAMP>//g ;
            # NOTE(review): as written this deletes the single whitespace
            # character between the slashes wherever it occurs -- confirm
            # whether an HTML entity was meant instead.
            $text =~ s/ //g ;



            # Rename the old image file name to equationNNN.gif

            $oldImageFileName = sprintf( $fileNamePrefix . "_gr_%d.gif", 
                             $oldImageNumber ) ;
            $newImageFileName = sprintf( "equation%03d.gif", 
                                         $newImageNumber ) ;

            # The result of rename() is not checked.
            rename( $parentDirectoryName . "/" . $oldImageFileDirectory . "/" . 
                    $oldImageFileName, 
                    $parentDirectoryName . "/" . $oldImageFileDirectory . "/" . 
                    $newImageFileName ) ;



            # Write the last image line.

            # NOTE(review): $equationNumber, $imageWidth and $imageHeight are
            # computed but $string holds only two newlines -- presumably the
            # image markup that used them is missing; confirm.
            $equationNumber = sprintf( "%03d", $newImageNumber ) ;
            $string = "\n\n" ;

            print NEWFILE $string ;



            # Print text if any.

            print NEWFILE $text ;

        
            # Step up the image number.

            $newImageNumber += $imageNumberIncrement ;
        } 
        # Non-image text.  Write it out.
        else
        {
            #  Strip off nuisance HTML code.

            $image =~ s/
//g ;
            # NOTE(review): same empty-pattern problem as in the branch above.
            $image =~ s///g ;
            $image =~ s///g ;
            $image =~ s///g ;
            $image =~ s/<\/PRE>//g ;
            $image =~ s/<\/CODE>//g ;
            $image =~ s/<\/B>//g ;
            $image =~ s/<\/SAMP>//g ;
            $image =~ s/ //g ;

            print NEWFILE $image ;
        }

    } # end for i
}




#============================================================================
#
# NAME
#
#     detectMSWordImageLine
#
#
# DESCRIPTION
#
#     Here is a typical MS Word image lines for us to parse:
#
#         

# NOTE(review): two "return" statements are fused together in
# detectMSWordImageLine below ("return ($line =~ /image.gif return( ...").
# The main script also calls processMSWordImageLine() and
# detectMSWordImageEndingLine(), and neither has a visible "sub" header in
# this file -- presumably their text was lost at this spot; restore from a
# pristine copy.
# NOTE(review): processMSWordImageEndingLine takes ( $line,
# $parentDirectoryName, $oldImageFileDirectory, $oldImageFileName,
# $newImageFileName ), pulls the digits after "image" out of the src
# attribute, renames imageNNN.gif to equationNNN.gif inside the old image
# directory, prints $string to NEWFILE and advances the global
# $newImageNumber by the global $imageNumberIncrement.  $1 is used without
# checking that the match succeeded, the rename() result is ignored, and
# $string holds only two newlines although $equationNumber is computed just
# before it -- confirm what was meant to be written.
# # We want to rewrite it as: # # equation072. # # EXAMPLE # # #============================================================================ sub detectMSWordImageLine { my( $line ) ; # Define local variables ($line) = @_ ; # and split up arguments. # Match height= return ($line =~ /image.gif return( $line =~ /src=".*image[0-9]*.*\.gif/ ) ; } #============================================================================ # # NAME # # processMSWordImageEndingLine # # # DESCRIPTION # # # EXAMPLE # # #============================================================================ sub processMSWordImageEndingLine { # Define local variables # and split up arguments. my( $line, $parentDirectoryName, $oldImageFileDirectory, $oldImageFileName, $newImageFileName ) ; ( $line, $parentDirectoryName, $oldImageFileDirectory, $oldImageFileName, $newImageFileName ) = @_ ; # Pull out the image number. $line =~ /src=".*image([0-9]*).*\.gif/ ; $oldImageNumber = $1 ; # Rename the old image file name to equationNNN.gif $oldImageFileName = sprintf( "image%03d.gif", $oldImageNumber ) ; $newImageFileName = sprintf( "equation%03d.gif", $newImageNumber ) ; rename( $parentDirectoryName . "/" . $oldImageFileDirectory . "/" . $oldImageFileName, $parentDirectoryName . "/" . $oldImageFileDirectory . "/" . $newImageFileName ) ; # Write the last image line. $equationNumber = sprintf( "%03d", $newImageNumber ) ; $string = "\n\n" ; print NEWFILE $string ; # Step up the image number. $newImageNumber += $imageNumberIncrement ; } #============================================================================ # # NAME # # detectMSWordSingleTextLine # # DESCRIPTION # # For a one line paragraph, # #

#         <p ...>though it converges slowly.  There are better formulas.</p>
#
#     rewrite as,
#
#         <p>
#         though it converges slowly.  There are better formulas.
#         </p>
#
# EXAMPLE
#
#     detectMSWordSingleTextLine( "<p class=MsoNormal>Some text.</p>\n" )
#
#============================================================================

sub detectMSWordSingleTextLine
{
    my( $line ) = @_ ;

    # Match <p ...> ... text ... </p> on one line.
    # NOTE(review): the opening tag had been replaced by line breaks, which
    # can never match a single input line.  Any <p> tag, with or without
    # attributes, is accepted here -- confirm against the original pattern.
    return ( $line =~ /<p\b[^>]*>.*<\/p>/ ) ;
}



#============================================================================
#
# NAME
#
#     processMSWordSingleTextLine
#
# DESCRIPTION
#
#     Pull the text out of a one line paragraph and write it to NEWFILE
#     between paragraph delimiters on lines of their own.  Paragraphs
#     which hold nothing but &nbsp; or white space are dropped.
#
# EXAMPLE
#
#     processMSWordSingleTextLine( "<p class=MsoNormal>Some text.</p>\n" ) ;
#
#============================================================================

sub processMSWordSingleTextLine
{
    my( $line ) = @_ ;

    # Pull out the text from the HTML syntax.
    #     <p ...> ... text ... </p>
    # Nothing to do if the line is not a one line paragraph after all.
    return  unless $line =~ /<p\b[^>]*>(.*)<\/p>/ ;

    my $lineOfText = $1 ;

    # Don't print empty lines,
    #     <p ...>&nbsp;</p>
    # The test looks at the argument, not at the caller's $_.
    return  if $lineOfText =~ /^(?:\s|&nbsp;)*$/ ;

    print NEWFILE "\n<p>\n$lineOfText\n</p>\n" ;

    return ;
}



#============================================================================
#
# NAME
#
#     detectMSWordMultilineText
#
# DESCRIPTION
#
#     Detect the first line of a paragraph which continues on the
#     following lines:  it has an opening <p ...> tag but does not end
#     with </p>.
#
# EXAMPLE
#
#     detectMSWordMultilineText( "<p class=MsoNormal>Since the ring of\n" )
#
#============================================================================

sub detectMSWordMultilineText
{
    my( $line ) = @_ ;

    # Match a line like
    #     <p ...>Since the ring of
    return( $line =~ /<p\b[^>]*>.*/ && !($line =~ /<\/p>$/) ) ;
}



#============================================================================
#
# NAME
#
#     processMSWordMultilineText
#
#
# DESCRIPTION
#
#     Here is a typical multiline MS Word text line for us to parse:
#

#         <p ...>cleanHtml.prl is a Perl utility to process the exported HTML
#         files generated by Microsoft Word or Mathematica.  It has been specialized to
#         format simple text with equations.</p>
#
#     We want to rewrite it as:
#
#         <p>
#         cleanHtml.prl is a Perl utility to process the exported HTML
#         files generated by Microsoft Word or Mathematica.  It has been
#         specialized to format simple text with equations.
#         </p>
#
# EXAMPLE
#
#     processMSWordMultilineText( "<p class=MsoNormal>cleanHtml.prl is\n" ) ;
#
#============================================================================

sub processMSWordMultilineText
{
    my( $line ) = @_ ;

    # Pull out the text in the line.
    # NOTE(review): the opening tag had been replaced by line breaks.  Any
    # <p> tag, with or without attributes, is accepted here -- confirm
    # against the original pattern.
    return  unless $line =~ /<p\b[^>]*>(.*)/ ;

    my $firstLineOfText = $1 ;

    # Paragraph begin delimiter on its own line, then the first text line.
    print NEWFILE "\n<p>\n$firstLineOfText\n" ;

    return ;
}



#============================================================================
#
# NAME
#
#     detectMSWordMultilineTextEnd
#
#
# DESCRIPTION
#
#     Detect the last line of a multiline paragraph:  it ends with </p>.
#
# EXAMPLE
#
#     detectMSWordMultilineTextEnd( "format simple text.</p>\n" )
#
#============================================================================

sub detectMSWordMultilineTextEnd
{
    my( $line ) = @_ ;

    # Detect ... text ... </p>
    return( $line =~ /.*<\/p>$/ ) ;
}



#============================================================================
#
# NAME
#
#     processMSWordMultilineTextEnd
#
#
# DESCRIPTION
#
#     Write the last text line of a multiline paragraph to NEWFILE and
#     seal it off with the paragraph end delimiter.
#
# EXAMPLE
#
#     processMSWordMultilineTextEnd( "format simple text.</p>\n" ) ;
#
#============================================================================

sub processMSWordMultilineTextEnd
{
    my( $line ) = @_ ;

    # Pull out the text in the line.  Nothing to do if there is no </p>.
    return  unless $line =~ /(.*)<\/p>/ ;

    my $lastLineOfText = $1 ;

    print NEWFILE "$lastLineOfText\n</p>\n" ;

    return ;
}



#============================================================================
#
# NAME
#
#     detectMathematicaMultilineText
#
#
# DESCRIPTION
#
#     Handle typical Mathematica multiline text:
#

#         The number of distinct primitive polynomials is
#
# EXAMPLE
#
#
#============================================================================

sub detectMathematicaMultilineText
{
    my( $line ) = @_ ;

    # NOTE(review): the pattern below is EMPTY.  Perl treats an empty
    # pattern as "reuse the last successfully matched pattern", so this test
    # depends on whatever matched before it.  Presumably the opening tag of
    # a Mathematica text section belongs here; it cannot be recovered from
    # this file and must be restored from a pristine copy.
    return( $line =~ // ) ;
}



#============================================================================
#
# NAME
#
#     detectMathematicaMultilineTextEnd
#
#
# DESCRIPTION
#
#     Detect the end of a Mathematica text section:  a closing heading
#     tag such as </H4>.
#
# EXAMPLE
#
#     detectMathematicaMultilineTextEnd( "</H4>\n" )
#
#============================================================================

sub detectMathematicaMultilineTextEnd
{
    my( $line ) = @_ ;

    return ( $line =~ /<\/H\d+>/ ) ;
}



#============================================================================
#
# NAME
#
#     processMathematicaIntermediateText
#
#
# DESCRIPTION
#
#     Strip the PRE, CODE, B and SAMP tags and the &nbsp; entities out of
#     a line of Mathematica text and write what is left to NEWFILE.
#
# EXAMPLE
#
#     processMathematicaIntermediateText( "<PRE><B>x</B> = 1</PRE>\n" ) ;
#
#============================================================================

sub processMathematicaIntermediateText
{
    my( $line ) = @_ ;

    # Strip off nuisance HTML syntax.  The opening tags mirror the closing
    # tags below; their patterns had been reduced to a line break and three
    # empty patterns, which do not remove anything sensible.
    foreach my $tagName (qw( PRE CODE B SAMP ))
    {
        $line =~ s/<$tagName>//g ;
    }

    foreach my $tagName (qw( PRE CODE B SAMP ))
    {
        $line =~ s/<\/$tagName>//g ;
    }

    # NOTE(review): the entity is assumed to be &nbsp; -- confirm.
    $line =~ s/&nbsp;//g ;

    print NEWFILE "$line" ;

    return ;
}