Skip to content

Add docker support #9

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 125 additions & 2 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,133 @@

""")
# Maps the language name shown to the user to the Tesseract language code
# used for OCR. Insertion order is preserved, so the order below is the order
# in which the names are iterated.
# Each display name must appear only once: a repeated key in a dict literal
# silently overwrites the earlier entry.
languages = {
    'Afrikaans': 'afr',
    'Amharic': 'amh',
    'Arabic': 'ara',
    'Assamese': 'asm',
    'Azerbaijani': 'aze',
    'Azerbaijani - Cyrillic': 'aze_cyrl',
    'Belarusian': 'bel',
    'Bengali': 'ben',
    'Tibetan': 'bod',
    'Bosnian': 'bos',
    'Breton': 'bre',
    'Bulgarian': 'bul',
    'Catalan; Valencian': 'cat',
    'Cebuano': 'ceb',
    'Czech': 'ces',
    'Chinese - Simplified': 'chi_sim',
    'Chinese - Traditional': 'chi_tra',
    'Cherokee': 'chr',
    'Corsican': 'cos',
    'Welsh': 'cym',
    'Danish': 'dan',
    'Danish - Fraktur (contrib)': 'dan_frak',
    'German': 'deu',
    'German - Fraktur (contrib)': 'deu_frak',
    'German (Fraktur Latin)': 'deu_latf',
    'Dzongkha': 'dzo',
    'Greek, Modern (1453-)': 'ell',
    'English': 'eng',
    'English, Middle (1100-1500)': 'enm',
    'Esperanto': 'epo',
    'Math / equation detection module': 'equ',
    'Estonian': 'est',
    'Basque': 'eus',
    'Faroese': 'fao',
    'Persian': 'fas',
    'Filipino (old - Tagalog)': 'fil',
    'Finnish': 'fin',
    'French': 'fra',
    # 'Spanish' is a short alias; 'Spanish; Castilian' further down maps to
    # the same code.
    'Spanish': 'spa',
    'German - Fraktur (now deu_latf)': 'frk',
    'French, Middle (ca.1400-1600)': 'frm',
    'Western Frisian': 'fry',
    'Scottish Gaelic': 'gla',
    'Irish': 'gle',
    'Galician': 'glg',
    'Greek, Ancient (to 1453) (contrib)': 'grc',
    'Gujarati': 'guj',
    'Haitian; Haitian Creole': 'hat',
    'Hebrew': 'heb',
    'Hindi': 'hin',
    'Croatian': 'hrv',
    'Hungarian': 'hun',
    'Armenian': 'hye',
    'Inuktitut': 'iku',
    'Indonesian': 'ind',
    'Icelandic': 'isl',
    'Italian': 'ita',
    'Italian - Old': 'ita_old',
    'Javanese': 'jav',
    'Japanese': 'jpn',
    'Kannada': 'kan',
    'Georgian': 'kat',
    'Georgian - Old': 'kat_old',
    'Kazakh': 'kaz',
    'Central Khmer': 'khm',
    'Kirghiz; Kyrgyz': 'kir',
    'Kurmanji (Kurdish - Latin Script)': 'kmr',
    'Korean': 'kor',
    'Korean (vertical)': 'kor_vert',
    'Kurdish (Arabic Script)': 'kur',
    'Lao': 'lao',
    'Latin': 'lat',
    'Latvian': 'lav',
    'Lithuanian': 'lit',
    'Luxembourgish': 'ltz',
    'Malayalam': 'mal',
    'Marathi': 'mar',
    'Macedonian': 'mkd',
    'Maltese': 'mlt',
    'Mongolian': 'mon',
    'Maori': 'mri',
    'Malay': 'msa',
    'Burmese': 'mya',
    'Nepali': 'nep',
    'Dutch; Flemish': 'nld',
    'Norwegian': 'nor',
    'Occitan (post 1500)': 'oci',
    'Oriya': 'ori',
    'Orientation and script detection module': 'osd',
    'Panjabi; Punjabi': 'pan',
    'Polish': 'pol',
    'Portuguese': 'por',
    'Pushto; Pashto': 'pus',
    'Quechua': 'que',
    'Romanian; Moldavian; Moldovan': 'ron',
    'Russian': 'rus',
    'Sanskrit': 'san',
    'Sinhala; Sinhalese': 'sin',
    'Slovak': 'slk',
    'Slovak - Fraktur (contrib)': 'slk_frak',
    'Slovenian': 'slv',
    'Sindhi': 'snd',
    'Spanish; Castilian': 'spa',
    'Spanish; Castilian - Old': 'spa_old',
    'Albanian': 'sqi',
    'Serbian': 'srp',
    'Serbian - Latin': 'srp_latn',
    'Sundanese': 'sun',
    'Swahili': 'swa',
    'Swedish': 'swe',
    'Syriac': 'syr',
    'Tamil': 'tam',
    'Tatar': 'tat',
    'Telugu': 'tel',
    'Tajik': 'tgk',
    'Tagalog (new - Filipino)': 'tgl',
    'Thai': 'tha',
    'Tigrinya': 'tir',
    'Tonga': 'ton',
    'Turkish': 'tur',
    'Uighur; Uyghur': 'uig',
    'Ukrainian': 'ukr',
    'Urdu': 'urd',
    'Uzbek': 'uzb',
    'Uzbek - Cyrillic': 'uzb_cyrl',
    'Vietnamese': 'vie',
    'Yiddish': 'yid',
    'Yoruba': 'yor',
}

with st.sidebar:
Expand Down
8 changes: 8 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Compose definition that builds and runs the app as a single container.
version: "3.3"
services:
  pdf-text:
    container_name: data-extractor
    build:
      context: .
      # The Dockerfile in this repository is named in lowercase, while the
      # default lookup expects "Dockerfile"; name it explicitly so the build
      # does not depend on any builder-specific fallback.
      dockerfile: dockerfile
    ports:
      # host:container - 8501 is the port the image exposes and serves on.
      - "8501:8501"
    restart: unless-stopped
21 changes: 21 additions & 0 deletions dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Image for the Streamlit app (app.py) with the OCR and PDF tooling it needs.
FROM python:3.12-slim

# Absolute working directory; created automatically when missing.
WORKDIR /app

# OS packages are installed before any application file is copied, so editing
# the source does not invalidate this (large) layer.
#   build-essential, git : kept from the original image in case requirements.txt
#                          compiles wheels or pulls VCS dependencies
#                          (NOTE(review): drop them if it does not).
#   curl                 : used by the HEALTHCHECK below.
#   poppler-utils        : PDF command-line tools.
#   tesseract-ocr-all    : Tesseract plus every packaged language.
# software-properties-common was removed: nothing in this file uses it, and
# it is reportedly not packaged in newer Debian releases - TODO confirm.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        poppler-utils \
        tesseract-ocr-all \
    && rm -rf /var/lib/apt/lists/*

# Copy only the dependency manifest first so the pip layer is reused until
# requirements.txt itself changes.
COPY requirements.txt ./
RUN pip3 install --no-cache-dir -r requirements.txt

# Unprivileged account for running the service; it owns /app so the app can
# still create files next to its sources.
RUN groupadd --system --gid 10001 app \
    && useradd --system --uid 10001 --gid app --create-home app \
    && chown app:app /app

# Copy the whole build context. The previous "COPY * /app/" expanded the
# wildcard and copied the *contents* of each matched directory, flattening any
# sub-directories into /app.
# NOTE(review): add a .dockerignore (.git, caches, virtualenvs) to keep the
# context small.
COPY --chown=app:app . ./

USER app

# Documentation only: the port Streamlit is told to listen on below.
EXPOSE 8501

# Report unhealthy (exit code 1) when the Streamlit health endpoint fails.
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1

ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
130 changes: 127 additions & 3 deletions packages.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,129 @@
poppler-utils
tesseract-ocr
tesseract-ocr-spa
tesseract-ocr-fra
tesseract-ocr-ara
tesseract-ocr-afr
tesseract-ocr-amh
tesseract-ocr-ara
tesseract-ocr-asm
tesseract-ocr-aze
tesseract-ocr-aze-cyrl
tesseract-ocr-bel
tesseract-ocr-ben
tesseract-ocr-bod
tesseract-ocr-bos
tesseract-ocr-bre
tesseract-ocr-bul
tesseract-ocr-cat
tesseract-ocr-ceb
tesseract-ocr-ces
tesseract-ocr-chi-sim
tesseract-ocr-chi-tra
tesseract-ocr-chr
tesseract-ocr-cos
tesseract-ocr-cym
tesseract-ocr-dan
tesseract-ocr-dan_frak
tesseract-ocr-deu
tesseract-ocr-deu_frak
tesseract-ocr-deu_latf
tesseract-ocr-dzo
tesseract-ocr-ell
tesseract-ocr-eng
tesseract-ocr-enm
tesseract-ocr-epo
tesseract-ocr-equ
tesseract-ocr-est
tesseract-ocr-eus
tesseract-ocr-fao
tesseract-ocr-fas
tesseract-ocr-fil
tesseract-ocr-fin
tesseract-ocr-fra
tesseract-ocr-frk
tesseract-ocr-frm
tesseract-ocr-fry
tesseract-ocr-gla
tesseract-ocr-gle
tesseract-ocr-glg
tesseract-ocr-grc
tesseract-ocr-guj
tesseract-ocr-hat
tesseract-ocr-heb
tesseract-ocr-hin
tesseract-ocr-hrv
tesseract-ocr-hun
tesseract-ocr-hye
tesseract-ocr-iku
tesseract-ocr-ind
tesseract-ocr-isl
tesseract-ocr-ita
tesseract-ocr-ita-old
tesseract-ocr-jav
tesseract-ocr-jpn
tesseract-ocr-kan
tesseract-ocr-kat
tesseract-ocr-kat-old
tesseract-ocr-kaz
tesseract-ocr-khm
tesseract-ocr-kir
tesseract-ocr-kmr
tesseract-ocr-kor
tesseract-ocr-kor-vert
tesseract-ocr-kur
tesseract-ocr-lao
tesseract-ocr-lat
tesseract-ocr-lav
tesseract-ocr-lit
tesseract-ocr-ltz
tesseract-ocr-mal
tesseract-ocr-mar
tesseract-ocr-mkd
tesseract-ocr-mlt
tesseract-ocr-mon
tesseract-ocr-mri
tesseract-ocr-msa
tesseract-ocr-mya
tesseract-ocr-nep
tesseract-ocr-nld
tesseract-ocr-nor
tesseract-ocr-oci
tesseract-ocr-ori
tesseract-ocr-osd
tesseract-ocr-pan
tesseract-ocr-pol
tesseract-ocr-por
tesseract-ocr-pus
tesseract-ocr-que
tesseract-ocr-ron
tesseract-ocr-rus
tesseract-ocr-san
tesseract-ocr-sin
tesseract-ocr-slk
tesseract-ocr-slk_frak
tesseract-ocr-slv
tesseract-ocr-snd
tesseract-ocr-spa
tesseract-ocr-spa-old
tesseract-ocr-sqi
tesseract-ocr-srp
tesseract-ocr-srp-latn
tesseract-ocr-sun
tesseract-ocr-swa
tesseract-ocr-swe
tesseract-ocr-syr
tesseract-ocr-tam
tesseract-ocr-tat
tesseract-ocr-tel
tesseract-ocr-tgk
tesseract-ocr-tgl
tesseract-ocr-tha
tesseract-ocr-tir
tesseract-ocr-ton
tesseract-ocr-tur
tesseract-ocr-uig
tesseract-ocr-ukr
tesseract-ocr-urd
tesseract-ocr-uzb
tesseract-ocr-uzb-cyrl
tesseract-ocr-vie
tesseract-ocr-yid
tesseract-ocr-yor