pdfBox 读取pdf文件

1、引入maven依赖

        <dependency>
          <groupId>org.apache.pdfbox</groupId>
          <artifactId>pdfbox</artifactId>
          <version>2.0.4</version>
        </dependency>

2、相关工具类:PdfParser.java

package com.insurance.tool;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper;

import com.insurance.pojo.Insurance;
import com.insurance.pojo.InsuranceOrder;
import com.insurance.pojo.InsuranceProgram;

/**
 * Extracts insurance-policy data from the text layer of a PDF using PDFBox.
 *
 * <p>Each page is converted to text on its own and handed to
 * {@link #parseContent(String, List)}, which recognises policy pages by their
 * title line and pulls the individual fields out with regular expressions.
 *
 * <p>Line breaks are matched as {@code \r?\n} wherever the layout depends on
 * them, because {@code PDFTextStripper} emits the platform line separator
 * (CRLF on Windows, LF elsewhere).
 */
public class PdfParser {

    // ---- Fields read directly from the page text -------------------------
    private static final Pattern insurancePoliceNoP = Pattern.compile("保险单号\\s(.*?)\\s");
    private static final Pattern insuranceApplicationNoP = Pattern.compile("投保单号\\s(.*?)\\s");
    // Whole "policy holder" / "insured" rows; matched up to the end of the line
    // and then dissected with the row-level patterns below.
    private static final Pattern policeHolderP = Pattern.compile("投 保 人.*\\r?\\n");
    private static final Pattern insuredP = Pattern.compile("被保险人.*\\r?\\n");
    private static final Pattern insuredAgeP = Pattern.compile("被保险人投保年龄\\s(.*?)(\r\n|\\s)");
    private static final Pattern beneficiaryP = Pattern.compile("身故受益人及分配方式\\s(.*?)(\r\n|\\s)");
    private static final Pattern insuranceNameP = Pattern.compile("险种名称及款式\\s(.*?)(\r\n|\\s)");
    // DOTALL: the validity period may wrap over several lines.
    private static final Pattern validPeriodP = Pattern.compile("保险期间\\s(.*?)\\s合同生效日", Pattern.DOTALL);
    private static final Pattern effectiveDateP = Pattern.compile("合同生效日\\s(.*?)(\r\n|\\s)");
    private static final Pattern chargeWayP = Pattern.compile("交费方式\\s(.*?)\\s");
    private static final Pattern feeP = Pattern.compile("保 险 费\\s(.*?)(\r\n|\\s)");
    private static final Pattern policeHolderCount = Pattern.compile("投保份数\\s(.*?)(\r\n|\\s)");
    // DOTALL: the coverage table spans several lines.
    private static final Pattern programListP = Pattern.compile("保险金额(.*?)保险责任与责任免除详见条款", Pattern.DOTALL);

    // ---- Fields inside a single person row --------------------------------
    private static final Pattern policeHolderNameP = Pattern.compile("投 保 人(.*?)性别");
    private static final Pattern insuredNameP = Pattern.compile("被保险人(.*?)性别");
    private static final Pattern genderP = Pattern.compile("性别(.*?)出生日期");
    private static final Pattern birthdayP = Pattern.compile("出生日期(.*?)证件号码");
    private static final Pattern idNumberP = Pattern.compile("证件号码(.*)$");

    // Line break as produced on any platform.
    private static final String LINE_BREAK_REGEX = "\\r?\\n";

    /**
     * Demo entry point: parses the PDF given as the first argument, or the
     * original sample path when no argument is supplied, and prints the result.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to read
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0)
                ? args[0]
                : "C:\\Users\\yinz\\Desktop\\场景1\\场景1_样例_电子保单识别.pdf";
        readPDF(path);
    }

    /**
     * Parses all policy pages of the PDF supplied as a stream.
     *
     * <p>The loaded document is always closed before returning. This method
     * does not itself close {@code stream}.
     *
     * @param stream PDF content
     * @return the orders found, one per recognised policy page; empty if none
     * @throws Exception if the PDF cannot be loaded or its text cannot be extracted
     */
    public static List<InsuranceOrder> readPDF(InputStream stream) throws Exception{
        PDDocument document = PDDocument.load(stream);
        try {
            List<InsuranceOrder> orderList = extractOrders(document);
            System.out.println(orderList);
            return orderList;
        } finally {
            // Release the document's buffers even when extraction fails.
            document.close();
        }
    }

    /**
     * Parses the PDF at {@code filePath} and prints the resulting orders to
     * standard output. Failures are reported on standard error and do not
     * propagate to the caller.
     *
     * @param filePath path of the PDF file to read
     */
    public static void readPDF(String filePath) {
        try {
            PDDocument document = PDDocument.load(new File(filePath));
            try {
                System.out.println(extractOrders(document));
            } finally {
                document.close();
            }
        } catch (Exception e) {
            // Best effort by design: report the problem instead of throwing.
            System.err.println("Failed to read PDF " + filePath + ": " + e);
        }
    }

    /**
     * Runs the text stripper page by page over {@code document} and collects
     * every order that {@link #parseContent(String, List)} recognises.
     */
    private static List<InsuranceOrder> extractOrders(PDDocument document) throws IOException {
        List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
        int pages = document.getNumberOfPages();

        PDFTextStripper stripper = new PDFTextStripper();
        // Emit text in reading order rather than in content-stream order.
        stripper.setSortByPosition(true);

        // Pages are 1-based; each page is parsed independently.
        for (int page = 1; page <= pages; page++) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
            parseContent(stripper.getText(document), orderList);
        }
        return orderList;
    }

    /**
     * Parses the text of one page. If the page starts with the policy title
     * and contains a policy number, a new {@link InsuranceOrder} is appended
     * to {@code list}; otherwise the page is ignored.
     *
     * <p>Fields that cannot be found are set to the empty string.
     *
     * @param content text of a single page; may be {@code null} or blank
     * @param list    receives the parsed order
     */
    private static void parseContent(String content, List<InsuranceOrder> list) {
        if (content == null || content.trim().length() == 0) {
            return;
        }
        if (!content.startsWith("个 人 人 身 保 险 保 险 单")) {
            return;
        }

        // A page without a policy number is not treated as a policy.
        String insurancePoliceNo = retrieveText(content, insurancePoliceNoP, 1);
        if (insurancePoliceNo.length() == 0) {
            return;
        }

        // Personal information
        InsuranceOrder order = new InsuranceOrder();
        list.add(order);
        order.setInsurancePoliceNo(insurancePoliceNo);
        order.setInsuranceApplicationNo(retrieveText(content, insuranceApplicationNoP, 1));

        String policeHolderInfo = retrieveTextKeepingInnerBlanks(content, policeHolderP, 0);
        order.setPoliceHolderName(retrieveText(policeHolderInfo, policeHolderNameP, 1));
        order.setPoliceHolderGender(retrieveText(policeHolderInfo, genderP, 1));
        order.setPoliceHolderBirthday(retrieveText(policeHolderInfo, birthdayP, 1));
        order.setPoliceHolderID(retrieveText(policeHolderInfo, idNumberP, 1));

        String insuredInfo = retrieveTextKeepingInnerBlanks(content, insuredP, 0);
        order.setInsuredName(retrieveText(insuredInfo, insuredNameP, 1));
        order.setInsuredGender(retrieveText(insuredInfo, genderP, 1));
        order.setInsuredBirthday(retrieveText(insuredInfo, birthdayP, 1));
        order.setInsuredID(retrieveText(insuredInfo, idNumberP, 1));

        order.setInsuredAge(retrieveText(content, insuredAgeP, 1));
        order.setBeneficiary(retrieveText(content, beneficiaryP, 1));

        // Insurance product information
        Insurance insurance = new Insurance();
        order.setInsurance(insurance);
        insurance.setName(retrieveText(content, insuranceNameP, 1));
        // The period may wrap; join the wrapped lines.
        insurance.setValidPeriod(retrieveText(content, validPeriodP, 1).replaceAll(LINE_BREAK_REGEX, ""));
        insurance.setEffectiveDate(retrieveText(content, effectiveDateP, 1));
        insurance.setChargeWay(retrieveText(content, chargeWayP, 1));
        insurance.setFee(retrieveText(content, feeP, 1));
        insurance.setPoliceHolderCount(retrieveText(content, policeHolderCount, 1));

        // Coverage items: one "<name> <amount>" row per line
        String programList = retrieveTextKeepingInnerBlanks(content, programListP, 1);
        for (String row : programList.split(LINE_BREAK_REGEX)) {
            String trimmedRow = row.trim();
            if (trimmedRow.length() == 0) {
                continue;
            }
            String[] columns = trimmedRow.split("\\s+");
            InsuranceProgram program = new InsuranceProgram();
            program.setName(columns[0]);
            // A row without a second column keeps a null fee instead of failing.
            if (columns.length > 1) {
                program.setFee(columns[1]);
            }
            order.getProgramList().add(program);
        }
    }

    /**
     * Returns the requested group of the first match of {@code p} in
     * {@code content}, trimmed and with all space characters removed, or the
     * empty string when there is no match.
     */
    private static String retrieveText(String content, Pattern p, int position) {
        Matcher m = p.matcher(content);
        if (m.find()) {
            return m.group(position).trim().replace(" ", "");
        }
        return "";
    }

    /**
     * Like {@link #retrieveText(String, Pattern, int)} but only trims the ends,
     * leaving inner blanks intact so the result can be parsed further.
     */
    private static String retrieveTextKeepingInnerBlanks(String content, Pattern p, int position) {
        Matcher m = p.matcher(content);
        if (m.find()) {
            return m.group(position).trim();
        }
        return "";
    }
}

相关实体类:InsuranceOrder.java

package com.insurance.pojo;

import java.util.ArrayList;
import java.util.List;

/**
 * One parsed insurance policy: identifiers, the policy holder, the insured
 * person, the insurance product and its list of coverage items.
 * All scalar values are kept as the raw strings read from the document.
 */
public class InsuranceOrder {

    // Policy identifiers
    private String insurancePoliceNo;       // policy number
    private String insuranceApplicationNo;  // application number

    // Policy holder
    private String policeHolderName;        // name
    private String policeHolderBirthday;    // date of birth
    private String policeHolderGender;      // gender
    private String policeHolderID;          // ID document number

    // Insured person
    private String insuredName;             // name
    private String insuredGender;           // gender
    private String insuredBirthday;         // date of birth
    private String insuredID;               // ID document number
    private String insuredAge;              // age at the time of application
    private String beneficiary;             // death beneficiary and allocation

    // Product and coverage
    private Insurance insurance;            // insurance product
    private List<InsuranceProgram> programList = new ArrayList<InsuranceProgram>();  // coverage items

    public String getInsurancePoliceNo() {
        return this.insurancePoliceNo;
    }

    public void setInsurancePoliceNo(String insurancePoliceNo) {
        this.insurancePoliceNo = insurancePoliceNo;
    }

    public String getInsuranceApplicationNo() {
        return this.insuranceApplicationNo;
    }

    public void setInsuranceApplicationNo(String insuranceApplicationNo) {
        this.insuranceApplicationNo = insuranceApplicationNo;
    }

    public String getPoliceHolderName() {
        return this.policeHolderName;
    }

    public void setPoliceHolderName(String policeHolderName) {
        this.policeHolderName = policeHolderName;
    }

    public String getPoliceHolderBirthday() {
        return this.policeHolderBirthday;
    }

    public void setPoliceHolderBirthday(String policeHolderBirthday) {
        this.policeHolderBirthday = policeHolderBirthday;
    }

    public String getPoliceHolderGender() {
        return this.policeHolderGender;
    }

    public void setPoliceHolderGender(String policeHolderGender) {
        this.policeHolderGender = policeHolderGender;
    }

    public String getPoliceHolderID() {
        return this.policeHolderID;
    }

    public void setPoliceHolderID(String policeHolderID) {
        this.policeHolderID = policeHolderID;
    }

    public String getInsuredName() {
        return this.insuredName;
    }

    public void setInsuredName(String insuredName) {
        this.insuredName = insuredName;
    }

    public String getInsuredGender() {
        return this.insuredGender;
    }

    public void setInsuredGender(String insuredGender) {
        this.insuredGender = insuredGender;
    }

    public String getInsuredBirthday() {
        return this.insuredBirthday;
    }

    public void setInsuredBirthday(String insuredBirthday) {
        this.insuredBirthday = insuredBirthday;
    }

    public String getInsuredID() {
        return this.insuredID;
    }

    public void setInsuredID(String insuredID) {
        this.insuredID = insuredID;
    }

    public String getInsuredAge() {
        return this.insuredAge;
    }

    public void setInsuredAge(String insuredAge) {
        this.insuredAge = insuredAge;
    }

    public String getBeneficiary() {
        return this.beneficiary;
    }

    public void setBeneficiary(String beneficiary) {
        this.beneficiary = beneficiary;
    }

    public Insurance getInsurance() {
        return this.insurance;
    }

    public void setInsurance(Insurance insurance) {
        this.insurance = insurance;
    }

    public List<InsuranceProgram> getProgramList() {
        return this.programList;
    }

    public void setProgramList(List<InsuranceProgram> programList) {
        this.programList = programList;
    }

    /** Renders every field as {@code name=value}, in a fixed order. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("InsuranceOrder [");
        text.append("insurancePoliceNo=").append(this.insurancePoliceNo);
        text.append(", insuranceApplicationNo=").append(this.insuranceApplicationNo);
        text.append(", policeHolderName=").append(this.policeHolderName);
        text.append(", policeHolderBirthday=").append(this.policeHolderBirthday);
        text.append(", policeHolderGender=").append(this.policeHolderGender);
        text.append(", policeHolderID=").append(this.policeHolderID);
        text.append(", insuredName=").append(this.insuredName);
        text.append(", insuredGender=").append(this.insuredGender);
        text.append(", insuredBirthday=").append(this.insuredBirthday);
        text.append(", insuredID=").append(this.insuredID);
        text.append(", insuredAge=").append(this.insuredAge);
        text.append(", beneficiary=").append(this.beneficiary);
        text.append(", insurance=").append(this.insurance);
        text.append(", programList=").append(this.programList);
        text.append("]");
        return text.toString();
    }

}

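相关实体类:InsuranceProgram.java(PdfParser 还引用了实体类 Insurance,原文未贴出其源码;可参照 PdfParser 中调用的 setName、setValidPeriod、setEffectiveDate、setChargeWay、setFee、setPoliceHolderCount 自行补充对应的 String 属性及 getter/setter)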

package com.insurance.pojo;

/**
 * A single coverage item of an insurance policy: its name and its amount,
 * both kept as raw strings.
 *
 * @author yinz
 */
public class InsuranceProgram {

    /** Name of the coverage item. */
    private String name;

    /** Amount of the coverage item. */
    private String fee;

    public String getName() {
        return this.name;
    }

    public String getFee() {
        return this.fee;
    }

    public void setName(String name) {
        this.name = name;
    }

    public void setFee(String fee) {
        this.fee = fee;
    }

    /** Renders both fields as {@code InsuranceProgram [name=..., fee=...]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("InsuranceProgram [");
        text.append("name=").append(this.name);
        text.append(", fee=").append(this.fee);
        text.append("]");
        return text.toString();
    }

}

此处用于读取的pdf文件:http://files.cnblogs.com/files/yinz/场景1_样例_电子保单识别.rar

时间: 2024-10-20 14:09:54

pdfBox 读取pdf文件的相关文章

深入学习python解析并读取PDF文件内容的方法

这篇文章主要学习了python解析并读取PDF文件内容的方法,包括对学习库的应用,python2.7和python3.6中python解析PDF文件内容库的更新,包括对pdfminer库的详细解释和应用.主要参考了一些已有的博客内容,代码. 主要思路是首先利用一个做项目的形式,描述所做的问题,运行环境,和需要安装的库,然后写代码,此代码是在python2.7中运行,然后写出在python3.6中运行的代码,并详细解释python2.7和python3.6中python库的一些不同之处,最后详细的

pdf.js如何跨域读取pdf文件?

今天,上线一个客户网站之后(使用的是广州新一代虚拟空间)发现在读取上传的pdf文件的时候读取错误,通过直接在浏览器输入文件地址的时候发现文件地址被重定向了(呵呵!),结果就是pdf文件源由本地直接变成了跨域获取.解决问题吧! 1.pdf.js获取文件的方法 You can modify the defaultUrl app option in the web/app_options.js file or you can append the ?file= query string to the

记一次为解决Python读取PDF文件的Shell操作

目录 一.背景 二.问题 三.解决 四.一顿分析及 Shell 操作 五.后续 一.背景 本想将 PDF 文件转换为 Word 文档,然后网上搜索了一下发现有挺多转换的软件.有的是免费的.收费,咱也不知哪个好使,还得一个个安装试用.先不说能不解决问题,就这安装试用想想就脑壳疼.便想起了"Python 大法",随即搜了几篇看起来比较完整的博客,二话不说粘贴复制,改改运行试试.使用环境(python3.6+pdfminer3k),代码这里就不放出来了. 二.问题 运气不好,这一试就报错WA

PDFBox 解析PDF文件-解析服务器文件

1.首先引进pom <!-- PDF读取依赖 --><dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>2.0.4</version></dependency> 2.controller层直接代码 /** * PDF解析 * @return */@PostMapping(

C# PDFBox 解析PDF文件

下载 PDFBox-0.7.3.zip PDFBox-0.7.3.dlllucene-demos-2.0.0.dlllucene-core-2.0.0.dllbcmail-jdk14-132.dllbcprov-jdk14-132.dllFontBox-0.1.0-dev.dllICSharpCode.SharpZipLib.dllIKVM.AWT.WinForms.dllIKVM.GNU.Classpath.dllIKVM.Runtime.dllikvm-native.dll放入Bin中 C#

java使用pdfbox操作pdf文件

import java.io.FileInputStream; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; public class Read {  public String rea

iOS开发:读取pdf文件

方法一:使用QLPreviewController #pragma mark  浏览存在沙盒的文件 -(void)quickLook { QLPreviewController *QLPreviewVc = [[QLPreviewController alloc] initWithNibName:nil bundle:nil]; QLPreviewVc.dataSource = self; QLPreviewVc.delegate = self; [self presentViewControl

JAVA 读取pdf文件

第一个路口action /* * wuhan syspro author zhangrui 2010/08/23 */ package jp.co.syspro.poo.action; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.apache.struts.action.Action; import org.apache.struts

使用pdfbox提取PDF文件中的flash文件

private static void parsePdfFile(String file) throws Exception { FileInputStream fis = new FileInputStream(file); PDFParser pdfParser = new PDFParser(fis); pdfParser.parse(); COSDocument cosDocument = pdfParser.getDocument(); List<COSObject> objList