# 发票 — invoice information extraction scripts (PDF -> Excel)
# Required packages: pdfplumber and openpyxl (third-party), os and re (stdlib).
import os
import re

import pdfplumber
from openpyxl import Workbook
# Create the workbook that will hold the extracted invoice fields.
wenjian = Workbook()
D = wenjian.active

# Column widths for the five output columns.
D.column_dimensions['A'].width = 40
D.column_dimensions['B'].width = 20
D.column_dimensions['C'].width = 30
D.column_dimensions['D'].width = 40
D.column_dimensions['E'].width = 20

# Header row: write ONE heading per column. Passing the whole list as a
# cell value is rejected by openpyxl, so each cell gets its own string.
title = ["开票号码", "开票日期", "买方信息", "卖方信息", "价税合计"]
for h, heading in enumerate(title, start=1):
    D.cell(1, h, heading)
# Invoice field extraction. (The original note mentions three invoice
# layouts; only this one extractor is present in this listing.)
def write_anotherthing(B, text, row=None):
    """Extract invoice fields from ``text`` and write them into one sheet row.

    Args:
        B: Workbook-like object; the values are written to ``B.active``.
        text: Text extracted from an invoice page. ``None`` is treated as
            empty text, in which case every field is written as ``None``.
        row: 1-based target row. When omitted, the module-level counter
            ``i`` is used, which keeps existing two-argument calls working.

    Columns written: 1 invoice number, 2 issue date, 3 buyer name,
    4 seller name, 5 total including tax. A field whose pattern does not
    match is written as ``None`` (the cell is left empty).
    """
    sheet = B.active
    text = text or ""
    if row is None:
        row = i  # module-level row counter maintained by the calling loop

    # (column, pattern) pairs. Raw strings keep ``\s`` / ``\d`` as regex
    # escapes instead of (invalid) Python string escapes.
    field_patterns = (
        (1, r'发票号码\s*[::]\s*(\d+)'),
        (2, r'开票日期\s*[::]\s*(.*?)\n'),
        (5, r'价税合计.*?[¥¥]\s*([\d.]+)'),
        (4, r'[销售]\s*名称\s*[::]\s*(.*?)\s'),
        (3, r'[购买]\s*名称\s*[::]\s*(.*?)\s'),
    )

    values = {}
    for column, pattern in field_patterns:
        # A cell must hold a single value, so keep only the first match
        # rather than the list that ``re.findall`` would return.
        match = re.search(pattern, text, re.DOTALL)
        values[column] = match.group(1) if match else None
        sheet.cell(row, column, values[column])

    print(values[3], values[5])
i = 2  # next worksheet row to fill; row 1 holds the headers

# All invoice PDFs are expected below a single folder.
name = '电子发票'  # name of the folder holding the invoices
targetDir = fr"D:\{name}"  # full path of that folder

# Collect the full path of every PDF below targetDir. Joining with
# ``dirpath`` keeps files found in sub-folders openable, and the suffix
# filter skips non-PDF files such as the .xlsx this script saves there.
files = []
for dirpath, dirnames, filenames in os.walk(targetDir):
    files += [
        os.path.join(dirpath, filename)
        for filename in filenames
        if filename.lower().endswith('.pdf')
    ]

# Read each invoice and write its fields into the next worksheet row.
for file in files:
    print(file)
    with pdfplumber.open(file) as pdf:
        # ``pdf.pages`` is a list of pages; the fields are read from the
        # first page only.
        if not pdf.pages:
            continue
        text = pdf.pages[0].extract_text()
    write_anotherthing(wenjian, text)
    i = i + 1

# Save the workbook next to the invoices once everything is extracted.
wenjian.save(fr"D:\{name}\{name}.xlsx")
import pdfplumber,os
from openpyxl import Workbook
# Create the workbook that will hold the extracted invoice fields.
wenjian = Workbook()
D = wenjian.active

# Column widths for the five output columns.
D.column_dimensions['A'].width = 40
D.column_dimensions['B'].width = 20
D.column_dimensions['C'].width = 30
D.column_dimensions['D'].width = 40
D.column_dimensions['E'].width = 20

# Header row: write ONE heading per column. Passing the whole list as a
# cell value is rejected by openpyxl, so each cell gets its own string.
title = ["开票号码", "开票日期", "买方信息", "卖方信息", "价税合计"]
for h, heading in enumerate(title, start=1):
    D.cell(1, h, heading)
def _first_token_after(text, marker):
    """Return the first whitespace-delimited token after ``marker``, or None."""
    _, found, tail = text.partition(marker)
    tokens = tail.split()
    return tokens[0] if found and tokens else None


i = 2  # next worksheet row to fill; row 1 holds the headers
path = r'C:\Users\乾舟\PycharmProjects\pythonProject\LF\电子发票\\'
extracted = []  # flat running log of every value extracted so far
files = os.listdir(path)
for file in files:
    # Only PDFs can be opened by pdfplumber; skip anything else in the folder.
    if not file.lower().endswith('.pdf'):
        continue
    pdf_path = os.path.join(path, file)
    print(pdf_path)
    with pdfplumber.open(pdf_path) as pdf:
        # ``pdf.pages`` is a list; the fields are read from the first page.
        if not pdf.pages:
            continue
        text = pdf.pages[0].extract_text() or ''

    # Each field is the first token following its label in the page text;
    # a missing label yields None instead of raising.
    xuhao = _first_token_after(text, '发票号码:')
    riqi = _first_token_after(text, '开票日期:')
    goumai = _first_token_after(text, '购 名称:')
    maijia = _first_token_after(text, '销 名称:')
    total = _first_token_after(text, '(小写)¥')

    row_values = [xuhao, riqi, goumai, maijia, total]
    extracted.extend(row_values)
    for column, value in enumerate(row_values, start=1):
        wenjian.active.cell(i, column, value)
    print(extracted)
    i = i + 1
wenjian.save('001.xlsx')
import pdfplumber,os
from openpyxl import Workbook
# Create the workbook that will hold the extracted invoice fields.
wenjian = Workbook()
D = wenjian.active

# Column widths for the five output columns.
D.column_dimensions['A'].width = 40
D.column_dimensions['B'].width = 20
D.column_dimensions['C'].width = 30
D.column_dimensions['D'].width = 40
D.column_dimensions['E'].width = 20

# Header row: write ONE heading per column. Passing the whole list as a
# cell value is rejected by openpyxl, so each cell gets its own string.
title = ["开票号码", "开票日期", "买方信息", "卖方信息", "价税合计"]
for h, heading in enumerate(title, start=1):
    D.cell(1, h, heading)
def _first_token_after(text, marker):
    """Return the first whitespace-delimited token after ``marker``, or None."""
    _, found, tail = text.partition(marker)
    tokens = tail.split()
    return tokens[0] if found and tokens else None


def _parse_amount(raw):
    """Return ``raw`` as a float; keep it unchanged if it is not numeric."""
    if raw is None:
        return None
    try:
        # Thousands separators would make float() fail, so drop them first.
        return float(raw.replace(',', ''))
    except ValueError:
        return raw


i = 2  # next worksheet row to fill; row 1 holds the headers
path = r'C:\Users\乾舟\PycharmProjects\pythonProject\LF\20241123\20241123\发票\\'
files = os.listdir(path)
for file in files:
    # Only PDFs can be opened by pdfplumber; skip anything else in the folder.
    if not file.lower().endswith('.pdf'):
        continue
    pdf_path = os.path.join(path, file)
    print(pdf_path)
    with pdfplumber.open(pdf_path) as pdf:
        # ``pdf.pages`` is a list; the fields are read from the first page.
        if not pdf.pages:
            continue
        text = pdf.pages[0].extract_text() or ''

    # Each field is the first token following its label in the page text;
    # a missing label yields None instead of raising.
    xuhao = _first_token_after(text, '发票号码:')
    riqi = _first_token_after(text, '开票日期:')
    goumai = _first_token_after(text, '购 名称:')
    maijia = _first_token_after(text, '销 名称:')
    # Store the total as a number so it can be summed in the spreadsheet.
    total = _parse_amount(_first_token_after(text, '(小写)¥'))

    row_values = [xuhao, riqi, goumai, maijia, total]
    for column, value in enumerate(row_values, start=1):
        wenjian.active.cell(i, column, value)
    print(row_values)
    i = i + 1
wenjian.save('001.xlsx')
import pdfplumber,os
from openpyxl import Workbook
# Create the workbook that will hold the extracted contract fields.
wenjian = Workbook()
D = wenjian.active

# Column widths (A-E are sized even though only three columns get headers).
D.column_dimensions['A'].width = 30
D.column_dimensions['B'].width = 30
D.column_dimensions['C'].width = 20
D.column_dimensions['D'].width = 40
D.column_dimensions['E'].width = 20

# Header row: write ONE heading per column. Passing the whole list as a
# cell value is rejected by openpyxl, so each cell gets its own string.
title = ["甲方", "乙方", "金额"]
for h, heading in enumerate(title, start=1):
    D.cell(1, h, heading)
def _first_token_after(text, marker):
    """Return the first whitespace-delimited token after ``marker``, or None."""
    _, found, tail = text.partition(marker)
    tokens = tail.split()
    return tokens[0] if found and tokens else None


def _parse_amount(raw):
    """Return ``raw`` as a float; keep it unchanged if it is not numeric."""
    if raw is None:
        return None
    try:
        # Thousands separators would make float() fail, so drop them first.
        return float(raw.replace(',', ''))
    except ValueError:
        return raw


i = 2  # next worksheet row to fill; row 1 holds the headers
path = r'C:\Users\乾舟\PycharmProjects\pythonProject\LF\20241123\20241123\合同\\'
files = os.listdir(path)
for file in files:
    # Only PDFs can be opened by pdfplumber; skip anything else in the folder.
    if not file.lower().endswith('.pdf'):
        continue
    pdf_path = os.path.join(path, file)
    print(pdf_path)
    with pdfplumber.open(pdf_path) as pdf:
        # ``pdf.pages`` is a list; the fields are read from the first page.
        if not pdf.pages:
            continue
        text = pdf.pages[0].extract_text() or ''
    print(text)
    print('---' * 20)

    # Each field is the first token following its label in the page text;
    # a missing label yields None instead of raising.
    jiafang = _first_token_after(text, '下简称甲方):')
    print(jiafang)
    yifang = _first_token_after(text, '简称乙方):')
    print(yifang)
    total = _first_token_after(text, '合计 ')
    print(total)
    # Store the amount as a number so it can be summed in the spreadsheet.
    total = _parse_amount(total)

    wenjian.active.cell(i, 1, jiafang)
    wenjian.active.cell(i, 2, yifang)
    wenjian.active.cell(i, 3, total)
    i = i + 1
wenjian.save('002.xlsx')
# 页: [1]  (forum pagination residue, not part of the scripts)