下面的代码演示如何用 Python 读取 PDF 文件中的文本,既可以读取本地的 PDF,也可以读取网络上的 PDF。
pdfminer下载地址:https://pypi.python.org/packages/source/p/pdfminer/pdfminer-20140328.tar.gz
#!/usr/bin/python
# -*- encoding:utf-8 -*-
"""Extract the text of a PDF (fetched from a URL or opened locally) with pdfminer.

This script targets Python 2: it relies on ``urllib2`` and ``cStringIO``.
"""

from urllib2 import urlopen
from cStringIO import StringIO

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage


def convert_pdf_to_txt(fp):
    """Return the text of every page of the PDF read from *fp*.

    Args:
        fp: A binary, seekable file-like object positioned at the start of a
            PDF document (an open file or an in-memory buffer).

    Returns:
        The extracted text, as written by ``TextConverter`` with the
        ``utf-8`` codec.

    Note:
        *fp* is closed before this function returns, including when page
        processing raises, so the caller must not reuse it afterwards.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty page-number set and maxpages=0 are passed so that
            # pdfminer does not restrict which pages are processed.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed, as the original did.
        return retstr.getvalue()
    finally:
        fp.close()
        retstr.close()


# Remote PDF: download it fully into memory so pdfminer gets a seekable stream.
url = 'http://pythonscraping.com/pages/warandpeace/chapter1.pdf'
fp = StringIO(urlopen(url).read())

# Local PDF alternative: open the file in binary mode instead.
# path = 'chapter1.pdf'
# fp = open(path, 'rb')

text = convert_pdf_to_txt(fp)
print(text)
时间: 2024-08-10 21:29:49