\n\n" ; } # End of Mathematica multiline text. Write it and seal off with # paragraph end delimiter. elsif ( $textState eq "insideMathematicaText" && detectMathematicaMultilineTextEnd( $_ ) ) { $textState = "" ; print NEWFILE "\n
\n\n" ; } # Write intermediate Mathematica text lines to file. elsif ($textState eq "insideMathematicaText") { processMathematicaIntermediateText( $_ ) ; } } # end while FILE # Print the end of the HTML file. printHTMLTrailer( NEWFILE ) ; # Rename the image file directory. rename( $parentDirectoryName . "/" . $oldImageFileDirectory, $parentDirectoryName . "/" . $newImageFileDirectory ) ; # Delete MS Word HTM file. deleteFile( $fileName ) ; # Close both files. close( FILE ) ; close( NEWFILE ) ; print "Done.\n" ; print "Your HTML file is $newFileName\n" ; print "Your renumbered equation .gif images are in $newImageFileDirectory\n" ; #============================================================================ # # NAME # # deleteFile # # # DESCRIPTION # # Delete a file by hook or by crook. # # EXAMPLE # # deleteFiles( "C:/junk.obj" ) ; # #============================================================================ sub deleteFile { my( $fileName ) ; # Define local variables ($fileName) = @_ ; # and split up arguments. # If delete doesn't work try a rename and then delete. if (!unlink( $fileName )) { print "Retrying delete $fileName.\n" ; $tempFileName = "junk" . time . ".txt" ; rename $fileName, $tempFileName ; # Otherwise, just rename the file. if (!unlink( $tempFileName )) { rename $tempFileName, "C:/Temp" . $tempFileName ; print "Can't delete $fileName. " ; print "Renamed it C:/Temp/$tempFileName.\n" ; } } } #============================================================================ # # NAME # # printLegalNotice # # # DESCRIPTION # # Print the legal notice. # # EXAMPLE # # #============================================================================ sub printLegalNotice { $legalNotice = "\n" . "cleanHtml Version 1.3 - Perl utility program which cleans up Microsoft Word\n" . "and Mathematica exported HTML containing mathematical equations.\n" . "Copyright (C) 2002-2009 by Sean Erik O'Connor. All Rights Reserved.\n" . "\n" . 
"cleanHtml comes with ABSOLUTELY NO WARRANTY; for details see the\n" . "GNU General Public License. This is free software, and you are welcome\n" . "to redistribute it under certain conditions; see the " . "GNU General Public License\n" . "for details.\n\n" ; print $legalNotice ; } #============================================================================ # # NAME # # getUserInput # # # DESCRIPTION # # Prompt the user for information. # # EXAMPLE # # #============================================================================ sub getUserInput { # Enter the parent directory, name of the .htm file from MS Word, the # name of the directory containing the .gif image files, the new # name desired and the starting image number. print "Enter the parent directory containing the files.\n" ; print "(RETURN = current directory)\n" ; print "=> " ; chop( $parentDirectoryName =' ."\n" .
'
' ."\n" .
'
' ."\n" .
' ' ."\n" .
'
' ."\n" .
' home' ."\n" .
' ' ."\n" .
'
' ."\n" . ' Copyright © 2000-2009 by Sean Erik O\'Connor. ' ."\n" . ' All Rights Reserved.' ."\n" . ' ' ."\n" . ' Last updated 00 Nov 3000.' ."\n" . '
' ."\n\n" ; $htmlEnd = '' ."\n" . '' ."\n" ; print $FILEHANDLE $footnote ; print $FILEHANDLE $htmlEnd ; } #============================================================================ # # NAME # # detectMathematicaImageLine # # # DESCRIPTION # # Detect the beginning of a single image line with multiple images as we # see in Mathematica, such as # # Step 2)Check if
#
# Step 2) Check if
# ![[Graphics:Images/math_gr_25.gif]](Images/math_gr_25.gif)
## # EXAMPLE # #============================================================================ sub processMathematicaImageLine { # Define local variables # and split up arguments. my( $line, $parentDirectoryName, $oldImageFileDirectory, $oldImageFileName, $newImageFileName ) ; ( $line, $parentDirectoryName, $oldImageFileDirectory, $oldImageFileName, $newImageFileName ) = @_ ; # This is the beginning of an image specification. $startOfImagePattern = "
//g ;
$text =~ s///g ;
$text =~ s///g ;
$text =~ s///g ;
$text =~ s/<\/PRE>//g ;
$text =~ s/<\/CODE>//g ;
$text =~ s/<\/B>//g ;
$text =~ s/<\/SAMP>//g ;
$text =~ s/ //g ;
# Rename the old image file name to equationNNN.gif
$oldImageFileName = sprintf( $fileNamePrefix . "_gr_%d.gif",
$oldImageNumber ) ;
$newImageFileName = sprintf( "equation%03d.gif",
$newImageNumber ) ;
rename( $parentDirectoryName . "/" . $oldImageFileDirectory . "/" .
$oldImageFileName,
$parentDirectoryName . "/" . $oldImageFileDirectory . "/" .
$newImageFileName ) ;
# Write the last image line.
$equationNumber = sprintf( "%03d", $newImageNumber ) ;
$string = "\n
\n" ;
print NEWFILE $string ;
# Print text if any.
print NEWFILE $text ;
# Step up the image number.
$newImageNumber += $imageNumberIncrement ;
}
# Non-image text. Write it out.
else
{
# Strip off nuisance HTML code.
$image =~ s///g ;
$image =~ s///g ;
$image =~ s///g ;
$image =~ s///g ;
$image =~ s/<\/PRE>//g ;
$image =~ s/<\/CODE>//g ;
$image =~ s/<\/B>//g ;
$image =~ s/<\/SAMP>//g ;
$image =~ s/ //g ;
print NEWFILE $image ;
}
} # end for i
}
#============================================================================
#
# NAME
#
# detectMSWordImageLine
#
#
# DESCRIPTION
#
# Here is a typical MS Word image line for us to parse:
#
# 
#
# We want to rewrite it as:
#
#
#
# EXAMPLE
#
#
#============================================================================
sub detectMSWordImageLine
{
    # Report whether a line of MS Word HTML references one of Word's
    # numbered equation images, i.e. contains  src="...imageNNN...gif
    #
    # Parameters:
    #     $line   One line of text to test.
    #
    # Returns:
    #     The result of the pattern match: true when the line references
    #     such an image, false otherwise.
    my( $line ) = @_ ;

    # NOTE(review): this copy of the sub contained unparseable residue at
    # this point (a dangling "height=" and an unterminated
    # "return ($line =~ /"), apparently the remains of an <img ...> example
    # or an older test.  Only the src= test below was a complete statement,
    # so it alone is kept.  Confirm against a pristine copy of the file.

    # Match  src="./index_files/image001.gif
    return( $line =~ /src=".*image[0-9]*.*\.gif/ ) ;
}
#============================================================================
#
# NAME
#
# processMSWordImageEndingLine
#
#
# DESCRIPTION
#
#
# EXAMPLE
#
#
#============================================================================
sub processMSWordImageEndingLine
{
    # Handle the line of an MS Word image specification that names the
    # image file: rename imageNNN.gif to equationMMM.gif on disk, write the
    # replacement text to NEWFILE and advance the running image number.
    #
    # Parameters:
    #     $line                   Line containing  src="...imageNNN...gif
    #     $parentDirectoryName    Directory containing the image directory.
    #     $oldImageFileDirectory  Name of the image directory.
    #     $oldImageFileName       Accepted for interface compatibility;
    #     $newImageFileName       both are overwritten below.
    #
    # Package variables read:     $newImageNumber, $imageNumberIncrement
    # Package variables written:  $oldImageNumber, $equationNumber, $string,
    #                             $newImageNumber
    # Output:                     bareword filehandle NEWFILE (opened outside
    #                             this sub).
    my( $line,
        $parentDirectoryName, $oldImageFileDirectory,
        $oldImageFileName, $newImageFileName ) = @_ ;

    # Pull out the image number.  If the line holds no image reference, stop
    # here: $1 would otherwise still carry the capture of some earlier,
    # unrelated match and the wrong file would be renamed.
    unless ( $line =~ /src=".*image([0-9]*).*\.gif/ )
    {
        warn "No MS Word image reference found in line; skipping it.\n" ;
        return ;
    }
    $oldImageNumber = $1 ;

    # Rename the old image file name to equationNNN.gif
    $oldImageFileName = sprintf( "image%03d.gif",    $oldImageNumber ) ;
    $newImageFileName = sprintf( "equation%03d.gif", $newImageNumber ) ;

    my $imageDirectory = $parentDirectoryName . "/" . $oldImageFileDirectory ;
    my $oldPath        = $imageDirectory . "/" . $oldImageFileName ;
    my $newPath        = $imageDirectory . "/" . $newImageFileName ;

    # Report, but do not abort on, a failed rename so that the remaining
    # lines of the document are still processed.
    rename( $oldPath, $newPath )
        or warn "Can't rename $oldPath to $newPath: $!\n" ;

    # Write the last image line.
    $equationNumber = sprintf( "%03d", $newImageNumber ) ;

    # NOTE(review): in this copy the string below contains only newlines;
    # the HTML that presumably referenced equation$equationNumber.gif looks
    # to have been stripped.  The bytes are kept exactly as found -- restore
    # the markup from a pristine copy of the file.
    $string = "\n\n\n" ;
    print NEWFILE $string ;

    # Step up the image number.
    $newImageNumber += $imageNumberIncrement ;
}
#============================================================================
#
# NAME
#
# detectMSWordSingleTextLine
#
# DESCRIPTION
#
# For a one line paragraph,
#
# though it converges slowly. There are better formulas.
#
# rewrite as,
#
#
# though it converges slowly. There are better formulas.
#
#
# EXAMPLE
#
#
#============================================================================
sub detectMSWordSingleTextLine
{
    # Report whether a line holds a complete one-line paragraph, i.e. an
    # opening <p ...> tag and its closing </p> on the same line.
    #
    # Parameters:
    #     $line   One line of text to test.
    #
    # Returns:
    #     The result of the pattern match (true for a one-line paragraph).
    my( $line ) = @_ ;

    # Match <p ...> ... text ... </p>
    #
    # NOTE(review): in this copy the pattern read /.*<\/p>/, which accepts
    # any line containing </p>, including the last line of a multi-line
    # paragraph.  The opening-tag part appears to have been stripped; it is
    # reconstructed here as <p\b[^>]*> -- confirm against a pristine copy.
    return( $line =~ /<p\b[^>]*>.*<\/p>/ ) ;
}
#============================================================================
#
# NAME
#
# processMSWordSingleTextLine
#
# DESCRIPTION
#
#
# EXAMPLE
#
#
#============================================================================
sub processMSWordSingleTextLine
{
    # Rewrite a one-line MS Word paragraph as a clean paragraph on NEWFILE.
    # Paragraphs that contain nothing but non-breaking spaces or white space
    # are dropped.
    #
    # Parameters:
    #     $line   Line of the form  <p ...> ... text ... </p>
    #
    # Package variables written:  $lineOfText
    # Output:                     bareword filehandle NEWFILE (opened outside
    #                             this sub).
    my( $line ) = @_ ;

    # Pull out the text from the HTML syntax.
    #     <p ...> ... text ... </p>
    #
    # Do nothing when the line is not a one-line paragraph; otherwise $1
    # would still hold the capture of an earlier, unrelated match.
    #
    # NOTE(review): the opening-tag part of this pattern and the <p>/</p>
    # written below appear to have been stripped from this copy (the pattern
    # read /(.*)<\/p>/ and the output held only newlines).  They are
    # reconstructed here as a plain paragraph -- confirm against a pristine
    # copy of the file.
    return unless $line =~ /<p\b[^>]*>(.*)<\/p>/ ;
    $lineOfText = $1 ;

    # Don't print empty lines,
    #     <p ...>&nbsp;</p>
    # The test is made on the captured text of $line.  The previous code
    # tested $_ instead, which only worked when the caller happened to pass
    # $_ as the argument.
    if ( !( $lineOfText =~ /\A(?:&nbsp;|\s)*\z/ ) )
    {
        print NEWFILE "\n<p>\n$lineOfText\n</p>\n" ;
    }
}
#============================================================================
#
# NAME
#
# detectMSWordMultilineText
#
# DESCRIPTION
#
#
# EXAMPLE
#
#
#============================================================================
sub detectMSWordMultilineText
{
    # Report whether a line opens a paragraph that continues on following
    # lines: it has an opening <p ...> tag but does not end with </p>.
    #
    # Parameters:
    #     $line   One line of text to test.
    #
    # Returns:
    #     True when the line starts a multi-line paragraph, false otherwise.
    my( $line ) = @_ ;

    # Match a line like
    #     <p ...>Since the ring of
    #
    # NOTE(review): in this copy the first test read /.*/, which matches
    # every string, so any line not ending in </p> was reported as the start
    # of a paragraph.  The opening-tag pattern appears to have been
    # stripped; it is reconstructed as <p\b[^>]*> -- confirm against a
    # pristine copy of the file.
    return( $line =~ /<p\b[^>]*>/ && !($line =~ /<\/p>$/) ) ;
}
#============================================================================
#
# NAME
#
# processMSWordMultilineText
#
#
# DESCRIPTION
#
# Here is a typical multiline MS Word text line for us to parse:
#
#
# cleanHtml.prl is a Perl utility to process the exported HTML
# files generated by Microsoft Word or Mathematica. It has been specialized to
# format simple text with equations.
#
# We want to rewrite it as:
#
#
# cleanHtml.prl is a Perl utility to process the exported HTML
# files generated by Microsoft Word or Mathematica. It has been
# specialized to format simple text with equations.
#
#
# EXAMPLE
#
#
#============================================================================
sub processMSWordMultilineText
{
    # Write the first line of a multi-line MS Word paragraph to NEWFILE:
    # a paragraph opener followed by the text that came after the original
    # <p ...> tag.
    #
    # Parameters:
    #     $line   Line of the form  <p ...> ... text ...
    #
    # Package variables written:  $firstLineOfText
    # Output:                     bareword filehandle NEWFILE (opened outside
    #                             this sub).
    my( $line ) = @_ ;

    # Pull out the text in the line.  Do nothing if there is no opening
    # tag, so that $1 is never read from an earlier, unrelated match.
    #
    # NOTE(review): in this copy the pattern read /(.*)/ (capturing the
    # whole line, tag included) and the output contained only newlines; the
    # <p ...> in the pattern and the <p> in the output appear to have been
    # stripped.  Both are reconstructed here -- confirm against a pristine
    # copy of the file.
    return unless $line =~ /<p\b[^>]*>(.*)/ ;
    $firstLineOfText = $1 ;

    print NEWFILE "\n<p>\n$firstLineOfText\n" ;
}
#============================================================================
#
# NAME
#
# detectMSWordMultilineTextEnd
#
#
# DESCRIPTION
#
#
# EXAMPLE
#
#
#============================================================================
sub detectMSWordMultilineTextEnd
{
    # Tell the caller whether the given line finishes a paragraph, that is,
    # whether it ends with the closing tag </p>.
    #
    # Parameters:
    #     one line of text to test.
    #
    # Returns:
    #     The result of the pattern match.
    my $candidate = shift ;

    # Detect ... text ... </p>
    return $candidate =~ /.*<\/p>$/ ;
}
#============================================================================
#
# NAME
#
# processMSWordMultilineTextEnd
#
#
# DESCRIPTION
#
#
# EXAMPLE
#
#
#============================================================================
sub processMSWordMultilineTextEnd
{
    # Write the last line of a multi-line MS Word paragraph to NEWFILE:
    # the text that precedes </p>, followed by the paragraph closer.
    #
    # Parameters:
    #     $line   Line of the form  ... text ... </p>
    #
    # Package variables written:  $lastLineOfText
    # Output:                     bareword filehandle NEWFILE (opened outside
    #                             this sub).
    my( $line ) = @_ ;

    # Pull out the text in the line.  Do nothing if there is no </p>:
    # $1 would otherwise still hold the capture of an earlier, unrelated
    # match and that stale text would be written to the output.
    return unless $line =~ /(.*)<\/p>/ ;
    $lastLineOfText = $1 ;

    # NOTE(review): in this copy the string printed here held only newlines
    # after the text; the closing </p> appears to have been stripped and is
    # reconstructed below -- confirm against a pristine copy of the file.
    print NEWFILE "$lastLineOfText\n</p>\n" ;
}
#============================================================================
#
# NAME
#
# detectMathematicaMultilineText
#
#
# DESCRIPTION
#
# Handle typical Mathematica multiline text:
#
#
#
# The number of distinct primitive polynomials is
#
#
# EXAMPLE
#
#
#============================================================================
sub detectMathematicaMultilineText
{
    # Report whether a line opens a block of Mathematica text.
    #
    # Parameters:
    #     $line   One line of text to test.
    #
    # Returns:
    #     The result of the pattern match.
    my( $line ) = @_ ;

    # NOTE(review): in this copy the pattern was empty ( // ).  Perl treats
    # an empty pattern as "reuse the last successfully matched pattern" and,
    # when there is none, as a pattern that matches every string, so the
    # result depended on whatever regex had run before.  The original
    # pattern appears to have been stripped.  It is reconstructed as the
    # opening counterpart of the </H\d+> closer that
    # detectMathematicaMultilineTextEnd tests for -- confirm against a
    # pristine copy of the file.
    return( $line =~ /<H\d+[^>]*>/ ) ;
}
#============================================================================
#
# NAME
#
# detectMathematicaMultilineTextEnd
#
#
# DESCRIPTION
#
#
# EXAMPLE
#
#
#============================================================================
sub detectMathematicaMultilineTextEnd
{
    # Tell the caller whether the given line contains a closing heading
    # tag of the form </H1>, </H2>, ... anywhere in it.
    #
    # Parameters:
    #     one line of text to test.
    #
    # Returns:
    #     The result of the pattern match.
    my $candidate = shift ;

    return $candidate =~ /<\/H\d+>/ ;
}
#============================================================================
#
# NAME
#
# processMathematicaIntermediateText
#
#
# DESCRIPTION
#
#
# EXAMPLE
#
#
#============================================================================
sub processMathematicaIntermediateText
{
    # Copy one intermediate line of a Mathematica text block to NEWFILE
    # after removing formatting tags and non-breaking-space entities.
    #
    # Parameters:
    #     $line   One line of text from inside a Mathematica text block.
    #
    # Output:     bareword filehandle NEWFILE (opened outside this sub).
    my( $line ) = @_ ;

    # Strip off nuisance HTML syntax:  <PRE> <CODE> <B> <SAMP> and their
    # closing tags.
    #
    # NOTE(review): in this copy the four opening-tag substitutions had
    # empty patterns ( s///g ).  An empty pattern makes Perl reuse the last
    # successfully matched regex, so they removed whatever had matched
    # earlier instead of the tags.  They are reconstructed from the four
    # closing-tag substitutions that survived (</PRE>, </CODE>, </B>,
    # </SAMP>) -- confirm against a pristine copy of the file.
    $line =~ s/<\/?(?:PRE|CODE|B|SAMP)>//g ;

    # NOTE(review): this copy read  s/ //g , which deletes every space and
    # would glue the words of the text together.  It is presumably a decoded
    # &nbsp; entity; restored as such -- confirm.
    $line =~ s/&nbsp;//g ;

    print NEWFILE $line ;
}