-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
156 lines (124 loc) · 4.79 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import os
import csv
import pandas as pd
import numpy as np
import pytesseract
from PIL import Image
from preprocess_tools import *
from correction_tools import *
from pdf2image import convert_from_path
"""
TODOS:
1. Loop through directories
2. Get all ".jpg" and run script
3. Save all output files in TXT and CSV directories under the main directory, mirroring the structure of the looped directories
"""
def clean_text_column(image):
    """OCR *image* and return one row per detected text block.

    The image is passed to ``pytesseract.image_to_data`` (Spanish language
    pack, DataFrame output). Rows with blank text or a confidence of 25 or
    less are discarded, and the remaining rows are grouped by ``block_num``.
    For each block the geometry columns and the confidence are replaced by
    their mean (rounded to two decimals) and the words are joined with single
    spaces.

    Returns:
        A DataFrame, re-indexed from zero, holding only blocks whose text
        splits into more than three space-separated pieces. The first six
        columns of the grouped frame are removed and an empty ``page`` column
        is appended.
    """
    words = pytesseract.image_to_data(image, output_type='data.frame', lang="spa")
    # Whitespace-only cells become NaN so rows without text can be dropped.
    words = words.replace(r'^\s*$', np.nan, regex=True)
    words = words.dropna(subset=["text"])
    # Confidence value:
    words = words[words['conf'] > 25]

    block_key = words['block_num'].ffill()
    output = words.groupby(block_key, sort=False).first()

    # Replace the "first row" values with per-block averages.
    for column in ('height', 'width', 'left', 'top', 'conf'):
        block_mean = words[column].dropna().groupby(block_key).mean()
        output[column] = round(block_mean, 2)

    # Rebuild each block's text from all of its words.
    output['text'] = words['text'].dropna().groupby(block_key).agg(' '.join)
    try:
        output['text'] = output['text'].str.replace('º', 'o')
    except Exception as e:
        print(f'Function gave error {e}')

    # Keep only blocks with more than three space-separated pieces.
    has_enough_words = output['text'].apply(
        lambda line: len(line.split(' ')) > 3)
    output = output[has_enough_words]
    # Drop the six leading columns of the grouped frame.
    # NOTE(review): presumably Tesseract's level/page/block/par/line/word
    # bookkeeping columns -- confirm against the installed pytesseract.
    output.drop(columns=output.columns[:6], axis=1, inplace=True)
    output["page"] = ""
    output = output.reset_index(drop=True)
    return output
def df_to_text_file(input, column_name, output_filename):
    """Write one DataFrame column to ``<output_filename>.txt``, one value per line.

    Args:
        input: DataFrame holding the column to export.
        column_name: Name of the column to write (converted with ``str``).
        output_filename: Output path without extension; ``.txt`` is appended.
            An existing file at that path is overwritten.
    """
    # Extract the requested column from the DataFrame.
    values = input[str(column_name)].tolist()
    # UTF-8 is set explicitly so accented OCR text is written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(str(output_filename) + ".txt", "w", encoding="utf-8") as f:
        # Formatting through an f-string also accepts non-string cells
        # (numbers, NaN) that would make ``row + "\n"`` raise TypeError.
        f.writelines(f"{value}\n" for value in values)
def image_to_txt_and_csv(input, output_filename):
    """OCR one image and save the result as both a CSV and a text file.

    The image at *input* is run through ``preprocess`` and
    ``clean_text_column``; every text line is then passed through
    ``unite_sign`` and ``correct_line``.

    Args:
        input: Path of the image to process.
        output_filename: Output path without extension. The full table is
            written to ``<output_filename>.csv`` and the ``text`` column to
            ``<output_filename>.txt``.
    """
    image = preprocess(input)
    df = clean_text_column(image)
    # Address the OCR text by column name rather than a fixed position, so
    # the correction cannot land on the wrong column if the layout changes.
    # NOTE(review): "combined_big_text.txt" is presumably the reference
    # corpus used by correct_line -- confirm in correction_tools.
    df["text"] = [
        correct_line(unite_sign(str(line)), "combined_big_text.txt")
        for line in df["text"]
    ]
    df.to_csv(str(output_filename) + ".csv", index=False)
    df_to_text_file(df, "text", str(output_filename))
def image_to_df(input):
    """OCR one page image and return its text blocks tagged with the page number.

    The page number is read from the file name, which must contain
    ``#<number>`` before the extension (for example ``edition#03.jpg``).

    Args:
        input: Path of the page image.

    Returns:
        The DataFrame produced by ``clean_text_column`` with ``page`` set to
        the parsed page number and each ``text`` entry passed through
        ``unite_sign`` and ``correct_line``. If correction fails part-way,
        the error is printed and the remaining lines are left uncorrected.

    Raises:
        ValueError: If the file name carries no ``#<number>`` page marker.
    """
    # Parse the page number from the file name only, and do it before the
    # expensive OCR: slicing the whole path broke as soon as any directory
    # contained a '.' or '#'.
    file_name = os.path.basename(str(input))
    marker_pos = file_name.find('#')
    if marker_pos == -1:
        raise ValueError(f"no '#<page>' marker in image name: {input}")
    dot_pos = file_name.find('.', marker_pos)
    page_token = file_name[marker_pos + 1:dot_pos if dot_pos != -1 else None]
    idx_page = int(page_token)

    image = preprocess(input)
    df = clean_text_column(image)
    df["page"] = idx_page

    # Correct line by line so lines fixed before a failure are kept.
    lines = df["text"].tolist()
    try:
        for position, raw_line in enumerate(lines):
            lines[position] = correct_line(
                unite_sign(str(raw_line)), "combined_big_text.txt")
    except Exception as e:
        print(f'Function gave error {e}')
    # Written back by name rather than by positional column index.
    df["text"] = lines
    return df
##### TESTING #####
# image_to_txt_and_csv(get_fullpath(os.getcwd(), "test2.jpg"), "testing")
# print("allgood")
# Batch driver: walk the data tree, OCR the collected page images and write
# one CSV of the concatenated results per visited directory.
# NOTE(review): the indentation of this script is not visible in this view,
# so the nesting described in the comments below is inferred from statement
# order -- confirm against the original file.
# Root of the tree to process, built from the current working directory.
directory = get_fullpath(os.getcwd(), "Data/Trome/2020")
for subdir, dirs, files in os.walk(directory):
# Image paths collected for the directory being visited.
imgs = []
for file in files:
# PNG files are queued for OCR as they are.
if file.endswith(".png"):
imgs.append(os.path.join(subdir, file))
elif file.endswith(".pdf"):
# NOTE(review): if the statements from here down to
# `imgs.append(save_path)` sit in this same branch, they are unreachable
# after `continue`, i.e. PDF-to-JPEG conversion is currently disabled.
continue
print("Working on:", file)
# Output folder named after characters 6..15 of the file name
# (presumably a date-like token -- confirm).
path = os.path.join(subdir, file[6:16])
#print(path)
# Render every PDF page as a JPEG; 600 is passed as pdf2image's second
# positional argument (dpi).
pages = convert_from_path(os.path.join(subdir, file), 600, fmt='jpeg')
if not os.path.exists(path):
os.makedirs(path)
for idx, val in enumerate(pages):
# "<name without last 4 chars>#<two-digit 1-based page>.jpg"; image_to_df
# reads the page number back from the text after '#'.
file_name = str(file[:-4]) + "#" + f"{int(idx)+1:02d}" + ".jpg"
save_path = os.path.join(path, file_name)
# Pages already saved by an earlier run are not written again.
if not os.path.exists(save_path):
val.save(save_path, 'JPEG')
print("done", save_path)
imgs.append(save_path)
# Sorts by name
if imgs != []:
imgs = sorted(imgs)
else:
pass
# Accumulator for the OCR results of this directory.
df = pd.DataFrame()
# The CSV is named after the directory and placed in its parent directory.
name = str(os.path.basename(subdir)) + ".csv"
path = str(os.path.dirname(subdir))
completeName = os.path.join(path, name)
# f = open(completeName, "w+")
for element in imgs:
print("Working on: " + element)
# print(image_to_df(element))
# Any failure while processing one image is reported and that image skipped.
try:
df_toadd = image_to_df(element)
except Exception as e:
print("Error while preprocessing ", e)
continue
# Results with fewer than two rows are discarded.
if len(df_toadd) < 2:
print("TOO SHORT")
continue
df = pd.concat([df, df_toadd])
df = df.reset_index(drop=True)
# Running total of rows collected so far.
print(len(df))
# Only write a CSV when at least one row was collected.
if not df.empty:
df.to_csv(completeName, index=False)
else:
continue