Skip to content

Commit

Permalink
Make OCR multi-thread etc.
Browse files Browse the repository at this point in the history
  • Loading branch information
pudo committed Jun 24, 2019
1 parent 1402d5a commit 38635b8
Show file tree
Hide file tree
Showing 8 changed files with 61 additions and 123 deletions.
9 changes: 5 additions & 4 deletions aleph/logic/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,19 @@ def process_collection(collection, ingest=True):
"""Trigger a full re-parse of all documents and re-build the
search index from the aggregator."""
aggregator = get_aggregator(collection)
aggregator.delete()
if ingest:
aggregator.delete()
try:
writer = aggregator.bulk()
for proxy in _collection_proxies(collection):
writer.put(proxy, fragment='db')
if ingest:
ingest_entity(collection, proxy)
writer.flush()
if not ingest:
index_entities(collection, aggregator.iterate())
else:
if ingest:
ingest_wait(collection)
else:
index_entities(collection, aggregator.iterate())
finally:
aggregator.close()

Expand Down
14 changes: 7 additions & 7 deletions aleph/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ def flushdeleted():

@manager.command
@manager.option('-f', '--foreign-id')
@manager.option('-p', '--process', is_flag=True, default=False)
@manager.option('-i', '--ingest', is_flag=True, default=False)
def update(foreign_id=None, process=False, ingest=False):
@manager.option('--index', is_flag=True, default=False)
@manager.option('--process', is_flag=True, default=False)
def update(foreign_id=None, index=False, process=False):
"""Re-index all the collections and entities."""
update_roles()
q = Collection.all(deleted=True)
Expand All @@ -102,8 +102,8 @@ def update(foreign_id=None, process=False, ingest=False):
index_collection(collection)
if collection.deleted_at is not None:
continue
if process or ingest:
payload = {'ingest': ingest}
if index or process:
payload = {'ingest': process}
queue_task(collection, OP_PROCESS, payload=payload)


Expand All @@ -119,7 +119,7 @@ def xref(foreign_id, against=None):

@manager.command
def bulkload(file_name):
"""Index all the entities in a given dataset."""
"""Load entities from the specified mapping file."""
log.info("Loading bulk data from: %s", file_name)
config = load_config_file(file_name)
for foreign_id, data in config.items():
Expand All @@ -140,7 +140,7 @@ def status(foreign_id):

@manager.command
def cancel(foreign_id):
"""Cancel all queued tasks."""
"""Cancel all queued tasks for the dataset."""
collection = get_collection(foreign_id)
cancel_queue(collection)

Expand Down
3 changes: 2 additions & 1 deletion aleph/views/collections_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ def update(collection_id):
def process(collection_id):
collection = get_db_collection(collection_id, request.authz.WRITE)
# re-process the documents
queue_task(collection, OP_PROCESS)
payload = {'ingest': get_flag('ingest', True)}
queue_task(collection, OP_PROCESS, payload=payload)
return ('', 202)


Expand Down
2 changes: 1 addition & 1 deletion services/convert-document/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ RUN apt-get -qq -y update \
hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
fonts-tlwg-purisa python3-pip python3-uno python3-icu \
fonts-tlwg-purisa python3-pip python3-uno python3-lxml python3-icu \
&& apt-get -qq -y autoremove \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
Expand Down
25 changes: 25 additions & 0 deletions services/ingest-file/tests/fixtures/john-doe.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
BEGIN:VCARD
VERSION:3.0
N:Doe;John;;;
FN:John Doe
ORG:Example.com Inc.;
TITLE:Imaginary test person
EMAIL;type=INTERNET;type=WORK;type=pref:johnDoe@example.org
TEL;type=WORK;type=pref:+1 617 555 1212
TEL;type=WORK:+1 (617) 555-1234
TEL;type=CELL:+1 781 555 1212
TEL;type=HOME:+1 202 555 1212
item1.ADR;type=WORK:;;2 Enterprise Avenue;Worktown;NY;01111;USA
item1.X-ABADR:us
item2.ADR;type=HOME;type=pref:;;3 Acacia Avenue;Hoemtown;MA;02222;USA
item2.X-ABADR:us
NOTE:John Doe has a long and varied history\, being documented on more police files that anyone else. Reports of his death are alas numerous.
item3.URL;type=pref:http\://www.example/com/doe
item3.X-ABLabel:_$!<HomePage>!$_
item4.URL:http\://www.example.com/Joe/foaf.df
item4.X-ABLabel:FOAF
item5.X-ABRELATEDNAMES;type=pref:Jane Doe
item5.X-ABLabel:_$!<Friend>!$_
CATEGORIES:Work,Test group
X-ABUID:5AD380FD-B2DE-4261-BA99-DE1D1DB52FBE\:ABPerson
END:VCARD
92 changes: 0 additions & 92 deletions services/recognize-text/Dockerfile.alpine

This file was deleted.

20 changes: 11 additions & 9 deletions services/recognize-text/textrecognizer/recognize.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import time
import logging
import threading
from PIL import Image
from io import BytesIO
from languagecodes import list_to_alpha3 as alpha3
Expand All @@ -16,6 +17,7 @@ class OCR(object):
def __init__(self):
# Tesseract language types:
_, self.supported = get_languages()
self.tl = threading.local()

def language_list(self, languages):
models = [c for c in alpha3(languages) if c in self.supported]
Expand All @@ -27,21 +29,21 @@ def language_list(self, languages):

def configure_engine(self, languages, mode):
# log.info("Configuring OCR engine (%s)", languages)
if not hasattr(self, 'api') or self.api is None:
self.api = PyTessBaseAPI(lang=languages, oem=OEM.LSTM_ONLY)
if languages != self.api.GetInitLanguagesAsString():
self.api.Init(lang=languages, oem=OEM.LSTM_ONLY)
if mode != self.api.GetPageSegMode():
self.api.SetPageSegMode(mode)
return self.api
if not hasattr(self.tl, 'api') or self.tl.api is None:
self.tl.api = PyTessBaseAPI(lang=languages, oem=OEM.LSTM_ONLY)
if languages != self.tl.api.GetInitLanguagesAsString():
self.tl.api.Init(lang=languages, oem=OEM.LSTM_ONLY)
if mode != self.tl.api.GetPageSegMode():
self.tl.api.SetPageSegMode(mode)
return self.tl.api

def clear_engine(self):
"""Shut down tesseract and clear all memory."""
try:
self.api.End()
self.tl.api.End()
except Exception:
log.exception("Failed to shut down tesseract")
self.api = None
self.tl.api = None

def extract_text(self, data, languages=None, mode=DEFAULT_MODE):
"""Extract text from a binary string of data."""
Expand Down
19 changes: 10 additions & 9 deletions services/recognize-text/textrecognizer/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ def __init__(self):
self.ocr = OCR()

def Recognize(self, image, context):
acquired = self.lock.acquire(blocking=False)
if acquired is False:
context.set_code(grpc.StatusCode.RESOURCE_EXHAUSTED)
context.set_details('OCR engine is busy.')
return Text()
# acquired = self.lock.acquire(blocking=False)
# if acquired is False:
# context.set_code(grpc.StatusCode.RESOURCE_EXHAUSTED)
# context.set_details('OCR engine is busy.')
# return Text()

try:
mode = self.MODES.get(image.mode, PSM.AUTO_OSD)
Expand All @@ -41,13 +41,14 @@ def Recognize(self, image, context):
log.exception("Failed OCR.")
self.ocr.clear_engine()
context.abort(grpc.StatusCode.INTERNAL, str(exc))
finally:
self.lock.release()

# finally:
# self.lock.release()


def serve(port):
options = [('grpc.max_receive_message_length', 10 * 1024 * 1024)]
executor = futures.ThreadPoolExecutor(max_workers=3)
options = [('grpc.max_receive_message_length', 20 * 1024 * 1024)]
executor = futures.ThreadPoolExecutor(max_workers=4)
server = grpc.server(executor, options=options)
add_RecognizeTextServicer_to_server(OCRServicer(), server)
server.add_insecure_port(port)
Expand Down

0 comments on commit 38635b8

Please sign in to comment.