Dependencies:
- simple-scan
- tesseract-svn
- imagemagick
- leptonica
- pdftk
- exactimage (provides the hocr2pdf tool)
- my bash script
Step 1: Use simple-scan and scan everything to a multipage PDF file.
Step 2: Run my script with the .pdf file as its only argument; the result is written next to the input as <name>_ocr.pdf.
DONE
#!/bin/bash
#
# Add a searchable OCR text layer to a scanned multipage PDF.
#
# Usage: <this script> FILE.pdf
#
# Pipeline:
#   1. pdftk splits FILE.pdf into single-page PDFs.
#   2. ImageMagick "convert" renders each page to a 600 dpi TIFF.
#   3. tesseract runs OCR on the TIFF and writes hOCR output.
#   4. hocr2pdf turns the hOCR into a text-only PDF (-n: no image).
#   5. pdftk merges that text PDF with the original page ("background").
#   6. pdftk joins all pages; the result is moved next to the input as
#      FILE_ocr.pdf.
#
# Environment:
#   OCR_LANG  language code passed to "tesseract -l" (default: deu)

set -u

# Deliberately NOT named LANG: that is the locale variable, and assigning
# "deu" to it would hand an invalid locale to every child process.
OCR_LANG=${OCR_LANG:-deu}

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

[[ $# -eq 1 ]] || die "Usage: ${0##*/} FILE.pdf"
[[ -f "$1" ]] || die "No such file: $1"

# Resolve the input to an absolute path *before* changing directory, so a
# relative argument still works for both reading and for the final output.
src_dir=$(cd -- "$(dirname -- "$1")" && pwd) || die "Cannot resolve directory of $1"
src_name=$(basename -- "$1" .pdf)
FILE="${src_dir}/$(basename -- "$1")"

# Private, unpredictable work directory; removed on every exit path.
work_dir=$(mktemp -d) || die "Cannot create temporary directory"
trap 'rm -rf -- "$work_dir"' EXIT
cd -- "$work_dir" || die "Cannot enter $work_dir"

prefix="page"

echo "split pages"
# %04d keeps the glob order equal to the page order for up to 9999 pages.
pdftk "$FILE" burst dont_ask output "${prefix}_%04d.pdf" \
  || die "pdftk failed to split $FILE"

# Take the page list once, before any *_n.pdf / *_added.pdf files exist.
pages=("${prefix}"_*.pdf)
[[ -e "${pages[0]}" ]] || die "pdftk produced no pages from $FILE"

for page_pdf in "${pages[@]}"; do
  f=${page_pdf%.pdf}

  echo "Converting ${f}.pdf to tif"
  # (The original also rendered a 150 dpi JPEG here that nothing ever read.)
  convert -quiet -density 600 -size 210x297 -depth 8 "${f}.pdf" "${f}.tif" \
    || die "convert failed on ${f}.pdf"

  echo "Using tesseract on ${f}.tif ..."
  tesseract "${f}.tif" "$f" -l "$OCR_LANG" hocr \
    || die "tesseract failed on ${f}.tif"

  # Older tesseract releases write <base>.html, newer ones <base>.hocr.
  hocr_file="${f}.hocr"
  [[ -f "$hocr_file" ]] || hocr_file="${f}.html"
  [[ -f "$hocr_file" ]] || die "tesseract wrote no hOCR output for ${f}.tif"

  echo "adding text to pdf"
  hocr2pdf -n -o "${f}_n.pdf" -i "${f}.tif" < "$hocr_file" \
    || die "hocr2pdf failed on ${f}.tif"
  pdftk "${f}.pdf" background "${f}_n.pdf" output "${f}_added.pdf" \
    || die "pdftk failed to merge text layer into ${f}.pdf"
done

echo "Joining Multipagepdf"
pdftk "${prefix}"_*_added.pdf cat output "${prefix}-ocr.pdf" \
  || die "pdftk failed to join pages"

echo "writing new pdf file"
mv "${prefix}-ocr.pdf" "${src_dir}/${src_name}_ocr.pdf" \
  || die "Cannot write ${src_dir}/${src_name}_ocr.pdf"

echo "deleting temp files"
# The work directory itself is removed by the EXIT trap set above.
PS: This should work on Ubuntu too.


