文档有些生词不认识,能划词翻译就很方便。
但是有些 PDF 文档里,看着是一个完整的词,划词的时候却划不全。比如 Preface,划词只能划到 Pref;强行划整个词的话,得到的结果是
“Pref ace”,很难受。
研究了好几天,废寝忘食,终于给我实验出来了。
下面贴代码,最后附上修复前后的 PDF。
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.*;
import java.net.URLEncoder;
import java.util.List;
/**
 * Rebuilds a PDF as plain, selectable text so that words which the original
 * file splits into fragments (e.g. "Pref ace") can be selected as whole words.
 *
 * <p>The class extends {@link PDFTextStripper} and overrides
 * {@link #writeString(String, List)} to re-derive word and line breaks from the
 * horizontal distance between consecutive glyphs. The rebuilt text of every
 * source page is then drawn onto fresh pages of a new document.
 *
 * <p>Instances are stateful ({@link #lastPositon}, {@link #dummy}) and are not
 * meant to be shared between concurrent extractions.
 */
public class Fixer extends PDFTextStripper {

    /** Input used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_SOURCE =
            "C:\\Users\\tanmingxin\\Desktop\\go\\The.Go.Programming.Language.pdf";

    /** Output used by {@link #main(String[])} when no second argument is given. */
    private static final String DEFAULT_TARGET =
            "C:\\Users\\tanmingxin\\Desktop\\go\\The.Go.Programming.Language-fixed.pdf";

    /** TrueType font embedded into the generated document. */
    private static final String FONT_FILE = "c:/windows/fonts/times.ttf";

    private static final float FONT_SIZE = 12;

    /** X coordinate at which every output line starts. */
    private static final int LEFT_X = 25;

    /** Baseline of the first output line on a page. */
    private static final int TOP_Y = 760;

    /** Vertical distance between two output baselines. */
    private static final int LINE_HEIGHT = 16;

    /** Lowest baseline still drawn on a page; further lines continue on a new page. */
    private static final int BOTTOM_Y = 25;

    /**
     * Glyph seen most recently by {@link #writeString(String, List)} whose text
     * has not been emitted yet. (The spelling of the name is kept as is.)
     */
    TextPosition lastPositon = null;

    /** Accumulates the rebuilt text of the page currently being extracted. */
    StringWriter dummy = new StringWriter();

    public Fixer() throws IOException {
    }

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the source PDF, {@code args[1]}
     *             the destination PDF; missing values fall back to the built-in
     *             default paths
     * @throws IOException if reading, font loading or writing fails
     */
    public static void main(String[] args) throws IOException {
        String srcFile = args.length > 0 ? args[0] : DEFAULT_SOURCE;
        String dstFile = args.length > 1 ? args[1] : DEFAULT_TARGET;
        doIt(srcFile, dstFile);
    }

    /**
     * Reads {@code inputFile} page by page, rebuilds the text of each page and
     * writes it into a new PDF saved as {@code outputFile}.
     *
     * @param inputFile  path of the PDF to read
     * @param outputFile path the rebuilt PDF is saved to
     * @throws IOException if the source cannot be read, the font cannot be
     *                     loaded, or the result cannot be written
     */
    public static void doIt(String inputFile, String outputFile)
            throws IOException {
        Fixer fixer = new Fixer();
        try (PDDocument document = Loader.loadPDF(new File(inputFile));
             PDDocument dstdocument = new PDDocument()) {
            PDFont font = PDType0Font.load(dstdocument, new File(FONT_FILE));
            fixer.setSortByPosition(true);
            int pageCount = document.getNumberOfPages();
            for (int i = 1; i <= pageCount; i++) {
                System.out.println("---------------------------");
                System.out.println("第" + i + "页");
                String text = fixer.extractPage(document, i);
                renderText(dstdocument, font, text);
            }
            dstdocument.save(outputFile);
        }
    }

    /**
     * Extracts the rebuilt text of a single page and resets the per-page state.
     *
     * @param document   source document
     * @param pageNumber 1-based page number
     * @return the page text, lines separated by {@code "\n"}
     * @throws IOException if text extraction fails
     */
    private String extractPage(PDDocument document, int pageNumber) throws IOException {
        setStartPage(pageNumber);
        setEndPage(pageNumber);
        // The stripper's regular output is not used; the text is collected in
        // the dummy field by writeString instead.
        writeText(document, new StringWriter());
        // writeString always holds back the latest glyph, so flush it here.
        if (lastPositon != null) {
            dummy.append(lastPositon.getUnicode());
            lastPositon = null;
        }
        String text = dummy.toString();
        dummy = new StringWriter();
        return text;
    }

    /**
     * Draws {@code text} line by line onto new pages appended to {@code target}.
     * At least one page is always added; when the lines do not fit above
     * {@link #BOTTOM_Y} the remainder continues on additional pages instead of
     * being drawn below the page edge.
     *
     * @param target document receiving the new page(s)
     * @param font   font used for the text
     * @param text   text to draw, lines separated by {@code "\n"}
     * @throws IOException if writing to the content stream fails
     */
    private static void renderText(PDDocument target, PDFont font, String text)
            throws IOException {
        String[] lines = text.split("\n");
        int next = 0;
        do {
            PDPage page = new PDPage();
            target.addPage(page);
            // try-with-resources closes the stream even if showText throws.
            try (PDPageContentStream content = new PDPageContentStream(target, page)) {
                content.setFont(font, FONT_SIZE);
                for (int y = TOP_Y; next < lines.length && y >= BOTTOM_Y; y -= LINE_HEIGHT) {
                    content.beginText();
                    content.newLineAtOffset(LEFT_X, y);
                    content.showText(lines[next++]);
                    content.endText();
                }
            }
        } while (next < lines.length);
    }

    /**
     * Collects glyphs into {@link #dummy}, deciding from the horizontal advance
     * between the previous glyph and the current one what to emit after the
     * previous glyph:
     * <ul>
     *   <li>X did not increase: the previous glyph ends a line, so it is
     *       followed by {@code "\n"};</li>
     *   <li>advance greater than the previous glyph's width plus half a space
     *       width: the previous glyph ends a word, so it is followed by a
     *       space;</li>
     *   <li>advance greater than half a space width: the previous glyph is
     *       emitted with no separator;</li>
     *   <li>otherwise the previous glyph is not emitted at all.</li>
     * </ul>
     * The current glyph is always held back in {@link #lastPositon} until the
     * next glyph (or the end of the page) is seen.
     *
     * @param string        text of the run as assembled by the stripper (unused)
     * @param textPositions glyphs of the run
     * @throws IOException declared by the overridden method
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        for (TextPosition text : textPositions) {
            if (lastPositon != null) {
                float advance = text.getX() - lastPositon.getX();
                if (advance > 0) {
                    float halfSpace = lastPositon.getWidthOfSpace() / 2;
                    // NOTE(review): when advance <= halfSpace the previous glyph
                    // is dropped; presumably this removes glyphs drawn twice at
                    // nearly the same position - confirm that this is intended.
                    if (advance > halfSpace) {
                        dummy.append(lastPositon.getUnicode());
                        if (advance > lastPositon.getWidth() + halfSpace) {
                            dummy.append(" ");
                        }
                    }
                } else {
                    dummy.append(lastPositon.getUnicode());
                    dummy.append("\n");
                }
            }
            lastPositon = text;
        }
    }
}
修复前后的 PDF 效果对比,下载地址:
链接:https://pan.baidu.com/s/10v3_XcszZa4tI-BHNI2RKA
提取码:4art