/**
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*
* Copyright 2007, 2008
* @author Florian Hackenberger
* @author Kenneth Berland
*/
package com.acoveo.hocrtopdf;
import java.io.*;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import au.id.jericho.lib.html.Source;
import au.id.jericho.lib.html.StartTag;
import au.id.jericho.lib.html.StartTagType;
import au.id.jericho.lib.html.HTMLElementName;
import com.lowagie.text.Document;
import com.lowagie.text.DocumentException;
import com.lowagie.text.Font;
import com.lowagie.text.FontFactory;
import com.lowagie.text.Image;
import com.lowagie.text.Rectangle;
import com.lowagie.text.pdf.CMYKColor;
import com.lowagie.text.pdf.PdfContentByte;
import com.lowagie.text.pdf.PdfWriter;
/** A quickhack for converting from hOCR to PDF
* @author fhackenberger
*/
/** Equally quick hack for multi-page hOCR
* @author Kenneth Berland
*/
public class HocrToPdf {
/**
* @param args
*/
public static void main(String[] args) {
try {
if(args.length < 1 || args[0] == "--help" || args[0] == "-h") {
System.out.print(
"Usage: java com.acoveo.hocrtopdf.HocrToPdf INPUTURL.html OUTPUTURL.pdf [input image dpi]\n" +
"\n" +
"Converts hOCR files into PDF\n" +
"\n" +
"Example: java com.acoveo.hocrtopdf.HocrToPdf file:///home/username/hocr.html ./output.pdf\n");
if(args.length < 1)
System.exit(-1);
else
System.exit(0);
}
URL inputHOCRFile = null;
FileOutputStream outputPDFStream = null;
String basePath = null;
try {
inputHOCRFile = new URL(args[0]);
//File MyFile = new File(inputHOCRFile.getPath());
//System.out.println( MyFile.getParent() );
//File MyFile = new File(inputHOCRFile.getPath());
basePath = (new File(inputHOCRFile.getPath())).getParent();
System.err.println("image base path is: " + basePath );
} catch (MalformedURLException e) {
System.out.println("The first parameter has to be a valid URL");
System.exit(-1);
}
try {
outputPDFStream = new FileOutputStream(args[1]);
} catch (FileNotFoundException e) {
System.out.println("The second parameter has to be a valid URL");
System.exit(-1);
}
Document pdfDocument = null;
PdfContentByte cb = null;
int pageCount = 1;
float pageImagePixelHeight = 0; // set to zero to surpress error
float dotsPerPointX = 0;
float dotsPerPointY = 0;
PdfWriter pdfWriter = null;
Font defaultFont = FontFactory.getFont(FontFactory.HELVETICA, 8, Font.BOLD, CMYKColor.BLACK);
// Using the jericho library to parse the HTML file
Source source=new Source(inputHOCRFile);
StartTag divTag = source.findNextStartTag(0, HTMLElementName.DIV, StartTagType.NORMAL);
while (divTag != null ){
System.out.println("rendering page: " + pageCount);
// Find the tag of class ocr_page in order to load the scanned image
//System.out.println("div tag start/end: " + divTag.getBegin() + ":" + divTag.getEnd() );
Pattern imagePattern = Pattern.compile("image\\s+([^;]+)");
Matcher imageMatcher = imagePattern.matcher(divTag.getElement().getAttributeValue("title"));
if(!imageMatcher.find()) {
System.out.println("Could not find a tag of class \"ocr_page\", aborting.");
System.exit(-1);
}
// Load the image
Image pageImage = null;
try {
pageImage = Image.getInstance(new URL("file://" + basePath + "/" + imageMatcher.group(1)));
} catch (MalformedURLException e) {
System.exit(-1);
}
// Open our output PDF with specs from page 1
// TODO this might suck
// The resolution of a PDF file (using iText) is 72pt per inch
if ( pageCount == 1 ){
float pointsPerInch = 72.0f;
System.out.println(pageImage.getDpiX() + ":" + pageImage.getDpiY() );
dotsPerPointX = pageImage.getDpiX() / pointsPerInch;
dotsPerPointY = pageImage.getDpiY() / pointsPerInch;
if (args.length >2 ){
dotsPerPointX = Integer.valueOf(args[2]) / pointsPerInch;
dotsPerPointY = Integer.valueOf(args[2]) / pointsPerInch;
}
pageImagePixelHeight = pageImage.getHeight();
pdfDocument = new Document(new Rectangle(pageImage.getWidth() / dotsPerPointX, pageImage.getHeight() / dotsPerPointY));
System.out.println(pageImage.getWidth() + ":" + pageImage.getHeight() );
System.out.println(dotsPerPointX + ":" + dotsPerPointY );
pdfWriter = PdfWriter.getInstance(pdfDocument, outputPDFStream);
pdfDocument.open();
// Put the text behind the picture (reverse for debugging)
cb = pdfWriter.getDirectContentUnder();
//PdfContentByte cb = pdfWriter.getDirectContent();
}
pdfDocument.newPage();
pageImage.scaleToFit(pageImage.getWidth() / dotsPerPointX, pageImage.getHeight() / dotsPerPointY);
pageImage.setAbsolutePosition(0, 0);
// Put the image in front of the text (reverse for debugging)
pdfWriter.getDirectContent().addImage(pageImage);
//pdfWriter.getDirectContentUnder().addImage(pageImage);
// In order to place text behind the recognised text snippets we are interested in the bbox property
Pattern bboxPattern = Pattern.compile("bbox(\\s+\\d+){4}");
// This pattern separates the coordinates of the bbox property
Pattern bboxCoordinatePattern = Pattern.compile("(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");
// Only tags of the ocr_line class are interesting
StartTag ocrLineTag = source.findNextStartTag(divTag.getEnd(), "class", "ocr_line", false);
//System.out.println("check: " + ocrLineTag.getBegin() + " < " + divTag.getElement().getEndTag().getEnd() );
while(ocrLineTag != null && ocrLineTag.getBegin() < divTag.getElement().getEndTag().getEnd() ) {
au.id.jericho.lib.html.Element lineElement = ocrLineTag.getElement();
Matcher bboxMatcher = bboxPattern.matcher(lineElement.getAttributeValue("title"));
if(bboxMatcher.find()) {
// We found a tag of the ocr_line class containing a bbox property
Matcher bboxCoordinateMatcher = bboxCoordinatePattern.matcher(bboxMatcher.group());
bboxCoordinateMatcher.find();
int[] coordinates = {Integer.parseInt((bboxCoordinateMatcher.group(1))),
Integer.parseInt((bboxCoordinateMatcher.group(2))),
Integer.parseInt((bboxCoordinateMatcher.group(3))),
Integer.parseInt((bboxCoordinateMatcher.group(4)))};
String line = lineElement.getContent().extractText();
float bboxWidthPt = (coordinates[2] - coordinates[0]) / dotsPerPointX;
float bboxHeightPt = (coordinates[3] - coordinates[1]) / dotsPerPointY;
// Put the text into the PDF
cb.beginText();
// Comment the next line to debug the PDF output (visible Text)
cb.setTextRenderingMode(PdfContentByte.TEXT_RENDER_MODE_INVISIBLE);
// TODO: Scale the text width to fit the OCR bbox
cb.setFontAndSize(defaultFont.getBaseFont(), Math.round(bboxHeightPt));
cb.moveText((float)(coordinates[0] / dotsPerPointX), (float)((pageImagePixelHeight - coordinates[3]) / dotsPerPointY));
cb.showText(line);
cb.endText();
}
ocrLineTag = source.findNextStartTag(ocrLineTag.getEnd(), "class", "ocr_line", false);
}
divTag = source.findNextStartTag(divTag.getEnd(), HTMLElementName.DIV, StartTagType.NORMAL);
pageCount++;
}
pdfDocument.close();
} catch (DocumentException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}