فهرست منبع

add lab-tesseract

westinyang 3 سال پیش
والد
کامیت
abe4e3dd5c
100 فایلهای تغییر یافته به همراه 758 افزوده شده و 0 حذف شده
  1. 1 0
      lab-tesseract/.gitignore
  2. 668 0
      lab-tesseract/@res/yunnan.chinatax.gov.cn.har
  3. BIN
      lab-tesseract/@res/发票.pdf
  4. BIN
      lab-tesseract/@res/接口.jpg
  5. 7 0
      lab-tesseract/@res/网址.txt
  6. BIN
      lab-tesseract/@res/网页.png
  7. 10 0
      lab-tesseract/README.md
  8. 41 0
      lab-tesseract/clear.py
  9. 31 0
      lab-tesseract/download.py
  10. BIN
      lab-tesseract/image/1.png
  11. BIN
      lab-tesseract/image/10.png
  12. BIN
      lab-tesseract/image/100.png
  13. BIN
      lab-tesseract/image/11.png
  14. BIN
      lab-tesseract/image/12.png
  15. BIN
      lab-tesseract/image/13.png
  16. BIN
      lab-tesseract/image/14.png
  17. BIN
      lab-tesseract/image/15.png
  18. BIN
      lab-tesseract/image/16.png
  19. BIN
      lab-tesseract/image/17.png
  20. BIN
      lab-tesseract/image/18.png
  21. BIN
      lab-tesseract/image/19.png
  22. BIN
      lab-tesseract/image/2.png
  23. BIN
      lab-tesseract/image/20.png
  24. BIN
      lab-tesseract/image/21.png
  25. BIN
      lab-tesseract/image/22.png
  26. BIN
      lab-tesseract/image/23.png
  27. BIN
      lab-tesseract/image/24.png
  28. BIN
      lab-tesseract/image/25.png
  29. BIN
      lab-tesseract/image/26.png
  30. BIN
      lab-tesseract/image/27.png
  31. BIN
      lab-tesseract/image/28.png
  32. BIN
      lab-tesseract/image/29.png
  33. BIN
      lab-tesseract/image/3.png
  34. BIN
      lab-tesseract/image/30.png
  35. BIN
      lab-tesseract/image/31.png
  36. BIN
      lab-tesseract/image/32.png
  37. BIN
      lab-tesseract/image/33.png
  38. BIN
      lab-tesseract/image/34.png
  39. BIN
      lab-tesseract/image/35.png
  40. BIN
      lab-tesseract/image/36.png
  41. BIN
      lab-tesseract/image/37.png
  42. BIN
      lab-tesseract/image/38.png
  43. BIN
      lab-tesseract/image/39.png
  44. BIN
      lab-tesseract/image/4.png
  45. BIN
      lab-tesseract/image/40.png
  46. BIN
      lab-tesseract/image/41.png
  47. BIN
      lab-tesseract/image/42.png
  48. BIN
      lab-tesseract/image/43.png
  49. BIN
      lab-tesseract/image/44.png
  50. BIN
      lab-tesseract/image/45.png
  51. BIN
      lab-tesseract/image/46.png
  52. BIN
      lab-tesseract/image/47.png
  53. BIN
      lab-tesseract/image/48.png
  54. BIN
      lab-tesseract/image/49.png
  55. BIN
      lab-tesseract/image/5.png
  56. BIN
      lab-tesseract/image/50.png
  57. BIN
      lab-tesseract/image/51.png
  58. BIN
      lab-tesseract/image/52.png
  59. BIN
      lab-tesseract/image/53.png
  60. BIN
      lab-tesseract/image/54.png
  61. BIN
      lab-tesseract/image/55.png
  62. BIN
      lab-tesseract/image/56.png
  63. BIN
      lab-tesseract/image/57.png
  64. BIN
      lab-tesseract/image/58.png
  65. BIN
      lab-tesseract/image/59.png
  66. BIN
      lab-tesseract/image/6.png
  67. BIN
      lab-tesseract/image/60.png
  68. BIN
      lab-tesseract/image/61.png
  69. BIN
      lab-tesseract/image/62.png
  70. BIN
      lab-tesseract/image/63.png
  71. BIN
      lab-tesseract/image/64.png
  72. BIN
      lab-tesseract/image/65.png
  73. BIN
      lab-tesseract/image/66.png
  74. BIN
      lab-tesseract/image/67.png
  75. BIN
      lab-tesseract/image/68.png
  76. BIN
      lab-tesseract/image/69.png
  77. BIN
      lab-tesseract/image/7.png
  78. BIN
      lab-tesseract/image/70.png
  79. BIN
      lab-tesseract/image/71.png
  80. BIN
      lab-tesseract/image/72.png
  81. BIN
      lab-tesseract/image/73.png
  82. BIN
      lab-tesseract/image/74.png
  83. BIN
      lab-tesseract/image/75.png
  84. BIN
      lab-tesseract/image/76.png
  85. BIN
      lab-tesseract/image/77.png
  86. BIN
      lab-tesseract/image/78.png
  87. BIN
      lab-tesseract/image/79.png
  88. BIN
      lab-tesseract/image/8.png
  89. BIN
      lab-tesseract/image/80.png
  90. BIN
      lab-tesseract/image/81.png
  91. BIN
      lab-tesseract/image/82.png
  92. BIN
      lab-tesseract/image/83.png
  93. BIN
      lab-tesseract/image/84.png
  94. BIN
      lab-tesseract/image/85.png
  95. BIN
      lab-tesseract/image/86.png
  96. BIN
      lab-tesseract/image/87.png
  97. BIN
      lab-tesseract/image/88.png
  98. BIN
      lab-tesseract/image/89.png
  99. BIN
      lab-tesseract/image/9.png
  100. BIN
      lab-tesseract/image/90.png

+ 1 - 0
lab-tesseract/.gitignore

@@ -0,0 +1 @@
+/clear_image

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 668 - 0
lab-tesseract/@res/yunnan.chinatax.gov.cn.har


BIN
lab-tesseract/@res/发票.pdf


BIN
lab-tesseract/@res/接口.jpg


+ 7 - 0
lab-tesseract/@res/网址.txt

@@ -0,0 +1,7 @@
+https://yunnan.chinatax.gov.cn/col/col8041/index.html
+https://app.yunnan.chinatax.gov.cn/static/pages/ynds/fpcx/fpcx3.html?gdslxdm=1
+
+153002209100
+03839955
+915303005577687468
+dfefb

BIN
lab-tesseract/@res/网页.png


+ 10 - 0
lab-tesseract/README.md

@@ -0,0 +1,10 @@
+研究使用 `tesseract` 和相关技术进行验证码识别
+
+需求资源:[@res](/%40res/)
+
+参考文章:https://blog.csdn.net/u011457798/article/details/84063963
+
+- [√] 使用官方的数据模型进行识别
+- [×] 优化和提升图像降噪处理技术
+- [×] 使用自己训练的模型进行识别
+- [×] 使用Python、Java等语言的绑定开发接口应用,并摆脱对安装 `tesseract` 程序的依赖

+ 41 - 0
lab-tesseract/clear.py

@@ -0,0 +1,41 @@
+import os
+from PIL import Image
+
+def clear_save_image(image_path):
+	image = Image.open(image_path)
+	image = clear_image(image)
+	image = image.convert('L') #灰度处理
+	file_name = os.path.splitext(os.path.basename(image_path))[0]
+	save_path = os.path.join(os.path.dirname(image_path), '../clear_image', file_name + '.jpg')
+	image.save(save_path) #图片转换成tif保存到clear_image文件夹中
+
+def clear_image(image):
+	image = image.convert('RGB')
+	width = image.size[0]
+	height = image.size[1]
+	noise_color = get_noise_color(image)
+	for x in range(width):
+		for y in  range(height):
+			#清除边框和干扰色
+			if (x == 0 or y == 0 or x == width - 1 or y == height - 1 
+				or image.getpixel((x, y)) == noise_color):
+				image.putpixel((x, y), (255, 255, 255))
+			#背景调整为白色	
+			if (image.getpixel((x, y))[0] > 180 and image.getpixel((x, y))[1] > 180 and image.getpixel((x, y))[2] > 180):
+				image.putpixel((x, y), (255, 255, 255))
+	return image
+
+def get_noise_color(image):
+	for y in range(1, image.size[1] - 1):
+		# 获取第2列非白的颜色
+		(r, g, b) = image.getpixel((2, y))
+		if r < 255 and g < 255 and b < 255:
+			return (r, g, b)
+
+if __name__ == '__main__':
+	os.makedirs('./clear_image/', exist_ok=True)
+	for file_path in os.listdir('image'):
+		file_path = os.path.join('image', file_path)
+		if os.path.isfile(file_path):
+			clear_save_image(file_path)
+	print("clear end.")

+ 31 - 0
lab-tesseract/download.py

@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+import requests  # http客户端
+import os  # 创建文件夹
+import time
+from PIL import Image
+
+os.makedirs('./image/', exist_ok=True)
+IMAGE_URL = "https://app.yunnan.chinatax.gov.cn/app/base/captcha.do?time=" + (str(round(time.time() * 1000)))
+print(IMAGE_URL)
+
+
+head = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
+}
+
+def request_download(num):
+    r = requests.get(IMAGE_URL, headers=head)
+    with open('./image/%s.png' % str(num), 'wb') as f:
+        f.write(r.content)
+
+
+try:
+    for i in range(100):
+        request_download((i+1))
+        print('download to ./image/%s.png' % str((i+1)))
+        #im = Image.open('./image/img.png')
+        #im.show()
+except:
+    print('download error!')

BIN
lab-tesseract/image/1.png


BIN
lab-tesseract/image/10.png


BIN
lab-tesseract/image/100.png


BIN
lab-tesseract/image/11.png


BIN
lab-tesseract/image/12.png


BIN
lab-tesseract/image/13.png


BIN
lab-tesseract/image/14.png


BIN
lab-tesseract/image/15.png


BIN
lab-tesseract/image/16.png


BIN
lab-tesseract/image/17.png


BIN
lab-tesseract/image/18.png


BIN
lab-tesseract/image/19.png


BIN
lab-tesseract/image/2.png


BIN
lab-tesseract/image/20.png


BIN
lab-tesseract/image/21.png


BIN
lab-tesseract/image/22.png


BIN
lab-tesseract/image/23.png


BIN
lab-tesseract/image/24.png


BIN
lab-tesseract/image/25.png


BIN
lab-tesseract/image/26.png


BIN
lab-tesseract/image/27.png


BIN
lab-tesseract/image/28.png


BIN
lab-tesseract/image/29.png


BIN
lab-tesseract/image/3.png


BIN
lab-tesseract/image/30.png


BIN
lab-tesseract/image/31.png


BIN
lab-tesseract/image/32.png


BIN
lab-tesseract/image/33.png


BIN
lab-tesseract/image/34.png


BIN
lab-tesseract/image/35.png


BIN
lab-tesseract/image/36.png


BIN
lab-tesseract/image/37.png


BIN
lab-tesseract/image/38.png


BIN
lab-tesseract/image/39.png


BIN
lab-tesseract/image/4.png


BIN
lab-tesseract/image/40.png


BIN
lab-tesseract/image/41.png


BIN
lab-tesseract/image/42.png


BIN
lab-tesseract/image/43.png


BIN
lab-tesseract/image/44.png


BIN
lab-tesseract/image/45.png


BIN
lab-tesseract/image/46.png


BIN
lab-tesseract/image/47.png


BIN
lab-tesseract/image/48.png


BIN
lab-tesseract/image/49.png


BIN
lab-tesseract/image/5.png


BIN
lab-tesseract/image/50.png


BIN
lab-tesseract/image/51.png


BIN
lab-tesseract/image/52.png


BIN
lab-tesseract/image/53.png


BIN
lab-tesseract/image/54.png


BIN
lab-tesseract/image/55.png


BIN
lab-tesseract/image/56.png


BIN
lab-tesseract/image/57.png


BIN
lab-tesseract/image/58.png


BIN
lab-tesseract/image/59.png


BIN
lab-tesseract/image/6.png


BIN
lab-tesseract/image/60.png


BIN
lab-tesseract/image/61.png


BIN
lab-tesseract/image/62.png


BIN
lab-tesseract/image/63.png


BIN
lab-tesseract/image/64.png


BIN
lab-tesseract/image/65.png


BIN
lab-tesseract/image/66.png


BIN
lab-tesseract/image/67.png


BIN
lab-tesseract/image/68.png


BIN
lab-tesseract/image/69.png


BIN
lab-tesseract/image/7.png


BIN
lab-tesseract/image/70.png


BIN
lab-tesseract/image/71.png


BIN
lab-tesseract/image/72.png


BIN
lab-tesseract/image/73.png


BIN
lab-tesseract/image/74.png


BIN
lab-tesseract/image/75.png


BIN
lab-tesseract/image/76.png


BIN
lab-tesseract/image/77.png


BIN
lab-tesseract/image/78.png


BIN
lab-tesseract/image/79.png


BIN
lab-tesseract/image/8.png


BIN
lab-tesseract/image/80.png


BIN
lab-tesseract/image/81.png


BIN
lab-tesseract/image/82.png


BIN
lab-tesseract/image/83.png


BIN
lab-tesseract/image/84.png


BIN
lab-tesseract/image/85.png


BIN
lab-tesseract/image/86.png


BIN
lab-tesseract/image/87.png


BIN
lab-tesseract/image/88.png


BIN
lab-tesseract/image/89.png


BIN
lab-tesseract/image/9.png


BIN
lab-tesseract/image/90.png


برخی فایل ها در این مقایسه diff نمایش داده نمی شوند زیرا تعداد فایل ها بسیار زیاد است