Fares 7 months ago
commit
3bd4e7f218
4 changed files with 80 additions and 0 deletions
  1. 17 0
      Dockerfile
  2. 51 0
      app.py
  3. 10 0
      docker-compose.yml
  4. 2 0
      requirements.txt

+ 17 - 0
Dockerfile

@@ -0,0 +1,17 @@
+# Use an official Python runtime as a parent image
+FROM python:3.9-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy only the dependency manifest first, so the pip layer below is rebuilt
+# only when requirements.txt changes rather than on every source edit
+COPY requirements.txt .
+
+# Install any needed packages specified in requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the current directory contents into the container at /app
+COPY . /app
+
+# Document that the app listens on port 5000; EXPOSE alone does not publish
+# the port, that is done with -p / the compose "ports" mapping
+EXPOSE 5000
+
+# Start the Flask server on all interfaces when the container launches
+CMD ["flask", "run", "--host=0.0.0.0"]

+ 51 - 0
app.py

@@ -0,0 +1,51 @@
+from flask import Flask, request, jsonify
+import fitz  # PyMuPDF
+import os
+
+app = Flask(__name__)
+
+# Define the upload folder
+UPLOAD_FOLDER = 'uploads'
+if not os.path.exists(UPLOAD_FOLDER):
+    os.makedirs(UPLOAD_FOLDER)
+
+@app.route('/upload', methods=['POST'])
+def upload_file():
+    # Check if the post request has the file part
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file part in the request'}), 400
+
+    file = request.files['file']
+
+    # If the user does not select a file, the browser submits an
+    # empty file without a filename.
+    if file.filename == '':
+        return jsonify({'error': 'No selected file'}), 400
+
+    if file and allowed_file(file.filename):
+        filepath = os.path.join(UPLOAD_FOLDER, file.filename)
+        file.save(filepath)
+
+        # Extract text from the PDF
+        text = extract_text_from_pdf(filepath)
+
+        # Optionally, remove the file after processing
+        os.remove(filepath)
+
+        return jsonify({'text': text}), 200
+    else:
+        return jsonify({'error': 'Invalid file type. Only PDF files are allowed.'}), 400
+
+def allowed_file(filename):
+    return '.' in filename and filename.rsplit('.', 1)[1].lower() == 'pdf'
+
+def extract_text_from_pdf(filepath):
+    document = fitz.open(filepath)
+    text = ""
+    for page_num in range(len(document)):
+        page = document.load_page(page_num)
+        text += page.get_text("text")
+    return text
+
+if __name__ == '__main__':
+    app.run(debug=True)

+ 10 - 0
docker-compose.yml

@@ -0,0 +1,10 @@
+services:
+  pdf2txt:
+    # Build the image from the Dockerfile in this directory
+    build: .
+    ports:
+      # host:container
+      - "5000:5000"
+    volumes:
+      # Mount the project directory over /app in the container
+      - .:/app
+    environment:
+      FLASK_APP: app.py
+      # Development mode; not intended for production deployments
+      FLASK_ENV: development

+ 2 - 0
requirements.txt

@@ -0,0 +1,2 @@
+Flask==2.0.3
+pymupdf==1.18.10