浏览代码

修改关键字提取函数

zyc 3 年之前
父节点
当前提交
175f41002b
共有 2 个文件被更改,包括 30 次插入和 11 次删除
  1. 30 11
      utils/PaddleOCR/deploy/hubserving/ocr_system/module.py
  2. 二进制
      utils/PaddleOCR/doc/imgs/test3.png

+ 30 - 11
utils/PaddleOCR/deploy/hubserving/ocr_system/module.py

@@ -96,7 +96,7 @@ class OCRSystem:
                 continue
 
             # 图片预处理
-            img = self.resizeImg(img, 800)
+            img = self.resizeImg(img, 640)
 
             starttime = time.time()
             dt_boxes, rec_res = self.text_sys(img)
@@ -111,10 +111,11 @@ class OCRSystem:
                 text, score = rec_res[dno]
                 if score > 0.8:
                     text_list.append(text)
-                    # print(text)
+                    print(text)
                     self.getInformation(text, kwargs['invoice_type'])
 
             inv_text = ''.join(text_list)
+            print(inv_text)
             self.getInformationAgain(inv_text, kwargs['invoice_type'])
             all_results.append({
                 'no': self.inv_no,
@@ -134,6 +135,10 @@ class OCRSystem:
             pt = re.compile(r'N[\w|\s]?(\d{8})', re.M)
             information_list = pt.findall(string)
             self.inv_no = information_list[0] if len(information_list) != 0 else ""
+        if self.inv_no == "":
+            pt = re.compile(r'号码:(\d{8})', re.M)
+            information_list = pt.findall(string)
+            self.inv_no = information_list[0] if len(information_list) != 0 else ""
 
         if self.inv_id == "":
             if invoice_type == 1:
@@ -142,15 +147,29 @@ class OCRSystem:
                 pt = re.compile(r'(\d{10})N', re.M)
             information_list = pt.findall(string)
             self.inv_id = information_list[0] if len(information_list) != 0 else ""
+        if self.inv_id == "":
+            if invoice_type == 1:
+                pt = re.compile(r'代码:(\d{12})', re.M)
+            else:
+                pt = re.compile(r'代码:(\d{10})', re.M)
+            information_list = pt.findall(string)
+            self.inv_id = information_list[0] if len(information_list) != 0 else ""
 
         if self.inv_company[1] == '':
-            pt = re.compile(r'称:(.*?)[-*+></\d]?[纳税]', re.M)
+            pt = re.compile(r'称:(.*?)[-*+></\d]?[纳税]', re.M)
             information_list = pt.findall(string)
             if len(information_list) != 0:
                 for i in range(len(self.inv_company)):
-                    if self.inv_company[i] == '':
-                        if len(information_list) != 0:
-                            self.inv_company[i] = information_list.pop(0)
+                    if len(information_list) != 0:
+                        self.inv_company[i] = information_list.pop(0)
+
+        if self.inv_identifier[1] == '':
+            pt = re.compile(r'别号:([a-zA-Z\d]{18})', re.M)
+            information_list = pt.findall(string)
+            if len(information_list) != 0:
+                for i in range(len(self.inv_identifier)):
+                    if len(information_list) != 0:
+                        self.inv_identifier[i] = information_list.pop(0)
 
 
         if self.inv_payee == "":
@@ -195,28 +214,28 @@ class OCRSystem:
                 return True
 
         if self.inv_payee == "":
-            pt = re.compile(r'款人:(.*)', re.M)
+            pt = re.compile(r'款人:(.*)$', re.M)
             information_list = pt.findall(string)
             self.inv_payee = information_list[0] if len(information_list) != 0 else ""
             if self.inv_payee != "":
                 return True
 
         if self.inv_review == "":
-            pt = re.compile(r'复核:(.*)', re.M)
+            pt = re.compile(r'复核:(.*)$', re.M)
             information_list = pt.findall(string)
             self.inv_review = information_list[0] if len(information_list) != 0 else ""
             if self.inv_review != "":
                 return True
 
         if self.inv_drawer == "":
-            pt = re.compile(r'票人:(.*)', re.M)
+            pt = re.compile(r'票人:(.*)$', re.M)
             information_list = pt.findall(string)
             self.inv_drawer = information_list[0] if len(information_list) != 0 else ""
             if self.inv_drawer != "":
                 return True
 
         if self.inv_identifier[1] == '':
-            pt = re.compile(r'[别号:]?([a-zA-Z\d]{18})$', re.M)
+            pt = re.compile(r'^[纳税人识别号:]?([a-zA-Z\d]{18})$', re.M)
             information_list = pt.findall(string)
             if len(information_list) != 0:
                 for i in range(len(self.inv_identifier)):
@@ -225,7 +244,7 @@ class OCRSystem:
                         return True
 
         if self.inv_identifier[1] == '':
-            pt = re.compile(r'称:(.*)', re.M)
+            pt = re.compile(r'称:(.*)$', re.M)
             information_list = pt.findall(string)
             if len(information_list) != 0:
                 for i in range(len(self.inv_company)):

二进制
utils/PaddleOCR/doc/imgs/test3.png