Import Modules¶

In [9]:
import os
import re
import shutil

import matplotlib.pyplot as plt
import PIL
import pytesseract
%matplotlib inline
In [3]:
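# Prerequisites
# 1. Python wrapper:  %pip install pytesseract
# 2. The Tesseract OCR engine itself (desktop/system install); pytesseract only
#    wraps that program and cannot run without it.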

Load the image¶

In [4]:
# Open test.JPG (resolved relative to the notebook's working directory) with
# Pillow and draw it with matplotlib so the input can be checked visually.
# NOTE(review): the imports cell only has `import PIL`; `PIL.Image` resolving
# here presumably relies on another import having loaded that submodule — confirm.
img = PIL.Image.open('test.JPG')
plt.imshow(img)
Out[4]:
<matplotlib.image.AxesImage at 0x2c8db9a2d00>

Convert Image to Text¶

In [5]:
# config
# Root of the local Tesseract installation (path kept exactly as written by the
# author; it is relative, so it is resolved against the working directory).
TESSDATA_PREFIX = 'Users/thomas/tesseract'

# `tesseract_cmd` has to name the tesseract *executable*. It was previously set
# to a path ending in `tessdata`, which is not something pytesseract can launch.
# Look for the binary inside the install root first, then on PATH; when neither
# has it, pytesseract's built-in default command is left untouched.
_tesseract_exe = shutil.which('tesseract', path=TESSDATA_PREFIX) or shutil.which('tesseract')
if _tesseract_exe:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_exe

# A plain Python variable is invisible to the Tesseract process, so the data
# location is exported through the environment. The export is skipped when the
# folder does not exist, so a working default setup is never overridden with a
# bad path.
# NOTE(review): Tesseract 4+ is understood to expect the tessdata directory
# itself here, while 3.x expected its parent — confirm against the installed version.
_tessdata_dir = os.path.join(TESSDATA_PREFIX, 'tessdata')
if os.path.isdir(_tessdata_dir):
    os.environ['TESSDATA_PREFIX'] = _tessdata_dir
In [6]:
# OCR step: pass an RGB copy of the loaded image to Tesseract, requesting the
# English language data, and keep the recognised text for the parsing cells below.
text_data = pytesseract.image_to_string(
    img.convert('RGB'),
    lang='eng',
)
In [8]:
# Show the raw OCR result. print() renders the line breaks contained in the
# string, whereas a bare expression would display the escaped repr.
print(text_data)
Name: Sample

Unique Policy Number: 12345
Amount: 100000

Start Date: 1/10/2019

End Date: 1/11/2019

Geo-Coordinates: 13.89,83.49


Extract Specific Fields¶

In [10]:
# Pull the value that follows the "Name:" label out of the OCR text.
# The pattern is a raw string so `\w` reaches the regex engine as a character
# class instead of being treated as an (invalid) string escape.
# `\w+` stops at the first non-word character, so only the first word of a
# multi-word name is captured.
m = re.search(r"Name: (\w+)", text_data)
# OCR can miss or garble the label; yield None rather than failing on m[1].
name = m[1] if m else None
name
Out[10]:
'Sample'
In [15]:
# Pull the value that follows the "Start Date:" label out of the OCR text.
# The pattern is a raw string so `\S` reaches the regex engine as a character
# class instead of being treated as an (invalid) string escape.
# `\S+` captures everything up to the next whitespace; the value stays a string
# and is not parsed into a date here.
m = re.search(r"Start Date: (\S+)", text_data)
# OCR can miss or garble the label; yield None rather than failing on m[1].
start_date = m[1] if m else None
start_date
Out[15]:
'1/10/2019'
In [16]:
# Pull the value that follows the "Geo-Coordinates:" label out of the OCR text.
# The pattern is a raw string so `\S` reaches the regex engine as a character
# class instead of being treated as an (invalid) string escape.
# `\S+` captures everything up to the next whitespace, so the pair is kept as a
# single comma-separated string and would be cut short if OCR inserted a space
# after the comma.
m = re.search(r"Geo-Coordinates: (\S+)", text_data)
# OCR can miss or garble the label; yield None rather than failing on m[1].
coordinates = m[1] if m else None
coordinates
Out[16]:
'13.89,83.49'
In [ ]: