How to Build a Web Document Scanner Using OpenCV-Python

Many excellent mobile document-scanning apps support not only image capture, but also edge detection and perspective transformation. If you are interested in these computer vision techniques, you can use OpenCV to build a free document scanner app yourself. In this post, I will show step by step how to use OpenCV-Python to create a web document scanner.

Setting Up Environment

Download Python 3.5.

Install Flask:

pip3 install flask

Install OpenCV 3.3.0 for Python:

pip3 install opencv-python

Download the latest NumPy 1.11.2. Unzip the package and build it:

python3 setup.py build install

To compile NumPy source code on Windows 10, install Microsoft Visual C++ Compiler for Python 2.7.

Web Document Scanner

Article and Code References

Steps of Building the App

Create document.py to do edge detection and perspective transformation:

import cv2
import rect
import numpy as np

class Scanner(object):
    """Find a four-cornered outline in an image and optionally flatten it.

    ``detect_edge`` looks for the largest contour that simplifies to four
    vertices and outlines it; ``four_point_transform`` warps the region
    bounded by four corner points into an upright rectangle.
    """

    # Lower and upper hysteresis thresholds passed to cv2.Canny.
    CANNY_THRESHOLDS = (0, 20)
    # Fraction of a contour's perimeter used as the approxPolyDP tolerance.
    APPROX_EPSILON_RATIO = 0.051

    # http://www.pyimagesearch.com/2014/08/25/4-point-opencv-getperspective-transform-example/
    def four_point_transform(self, image, rect):
        """Warp the quadrilateral ``rect`` of ``image`` to a top-down view.

        Args:
            image: source image to warp.
            rect: four (x, y) corner points. They are NOT reordered here and
                must already be in top-left, top-right, bottom-right,
                bottom-left order.

        Returns:
            The warped image, sized to the longer of each pair of opposite
            edges of ``rect``.
        """
        # cv2.getPerspectiveTransform only accepts float32 points, so coerce
        # once up front (a no-op when the caller already passes float32).
        src = np.asarray(rect, dtype="float32")
        (tl, tr, br, bl) = src

        # the width of the new image is the longer of the bottom edge
        # (bottom-right to bottom-left) and the top edge (top-right to
        # top-left)
        widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
        widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
        maxWidth = max(int(widthA), int(widthB))

        # the height of the new image is the longer of the right edge
        # (top-right to bottom-right) and the left edge (top-left to
        # bottom-left)
        heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
        heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
        maxHeight = max(int(heightA), int(heightB))

        # destination corners of the "birds eye view" (top-down view), in the
        # same top-left, top-right, bottom-right, bottom-left order as the
        # source points
        dst = np.array([
            [0, 0],
            [maxWidth - 1, 0],
            [maxWidth - 1, maxHeight - 1],
            [0, maxHeight - 1]], dtype="float32")

        # compute the perspective transform matrix and then apply it
        M = cv2.getPerspectiveTransform(src, dst)
        return cv2.warpPerspective(image, M, (maxWidth, maxHeight))

    # https://github.com/vipul-sharma20/document-scanner
    def detect_edge(self, image, enabled_transform=False):
        """Outline the largest four-sided contour found in ``image``.

        Args:
            image: BGR image. The outline is drawn onto it in place.
            enabled_transform: when True, also warp the outlined region of
                the untouched original with ``four_point_transform``.

        Returns:
            A tuple ``(image, warped)``. ``image`` is the input carrying a
            green outline if a four-sided contour was found. ``warped`` is
            the flattened region, or None when the transform is disabled or
            no four-sided contour was found.
        """
        warped = None
        # Keep a clean copy so the warped result does not contain the outline.
        original = image.copy()

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        low, high = self.CANNY_THRESHOLDS
        edged = cv2.Canny(blurred, low, high)

        # findContours returns (image, contours, hierarchy) on OpenCV 3.x but
        # (contours, hierarchy) on 2.x and 4.x; the contour list is always
        # the second-to-last element.
        contours = cv2.findContours(
            edged, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)[-2]

        # Largest area first, so the first quadrilateral found wins.
        for cnt in sorted(contours, key=cv2.contourArea, reverse=True):
            epsilon = self.APPROX_EPSILON_RATIO * cv2.arcLength(cnt, True)
            approx = cv2.approxPolyDP(cnt, epsilon, True)

            if len(approx) != 4:
                continue

            cv2.drawContours(image, [approx], -1, (0, 255, 0), 2)

            if enabled_transform:
                # rect.rectify comes from the project's rect module; it is
                # presumably what puts the corners in the order expected by
                # four_point_transform -- confirm against rect.py.
                corners = rect.rectify(approx)
                warped = self.four_point_transform(original, corners)
            break

        return image, warped

Create camera.py to capture frames from a camera:

import cv2
from document import Scanner

class VideoCamera(object):
    """Camera wrapper producing JPEG-encoded preview and scan frames.

    Preview frames carry the outline drawn by ``Scanner.detect_edge``; a
    captured frame is the perspective-corrected result of the same call.
    """

    def __init__(self, device_index=2):
        """Open the camera.

        Args:
            device_index: index passed to ``cv2.VideoCapture``. Defaults to
                2; use 0 when only one capture device is connected.
        """
        # Open a camera
        self.cap = cv2.VideoCapture(device_index)

        # Recording-related attributes; nothing in this class reads them.
        self.is_record = False
        self.out = None

        # JPEG bytes of the most recent successful capture_frame() result.
        self.transformed_frame = None

        self.scanner = Scanner()
        # JPEG bytes of the most recent successful preview frame.
        self.cached_frame = None

    def __del__(self):
        # __init__ may have failed before self.cap was assigned.
        cap = getattr(self, 'cap', None)
        if cap is not None:
            cap.release()

    @staticmethod
    def _encode_jpeg(frame):
        """Return ``frame`` as JPEG bytes, or None if it is None or encoding fails."""
        if frame is None:
            return None
        ok, jpeg = cv2.imencode('.jpg', frame)
        return jpeg.tobytes() if ok else None

    def get_video_frame(self):
        """Read one frame, outline any detected document and return JPEG bytes.

        Returns:
            The encoded frame, or None when the camera read or the encoding
            fails. A successful result is also kept for get_cached_frame().
        """
        ret, frame = self.cap.read()
        if not ret:
            return None

        frame, _ = self.scanner.detect_edge(frame)
        data = self._encode_jpeg(frame)
        if data is not None:
            # Cache the encoded bytes so the cached frame can be streamed
            # exactly like a live one.
            self.cached_frame = data
        return data

    def capture_frame(self):
        """Read one frame and store its perspective-corrected scan.

        The previous scan is kept when the camera read fails or when no
        four-sided outline is detected in the frame. Always returns None;
        fetch the result with get_image_frame().
        """
        ret, frame = self.cap.read()
        if not ret:
            return None

        # detect_edge yields None as the warped image when it finds no
        # quadrilateral; passing that to cv2.imencode would raise.
        _, warped = self.scanner.detect_edge(frame, True)
        data = self._encode_jpeg(warped)
        if data is not None:
            self.transformed_frame = data
        return None

    def get_cached_frame(self):
        """Return the last successfully encoded preview as JPEG bytes, or None."""
        return self.cached_frame

    def get_image_frame(self):
        """Return the last captured scan as JPEG bytes, or None."""
        return self.transformed_frame

Note: if only one camera is connected to your machine, the device index passed to cv2.VideoCapture() should be 0.

Create server.py to stream camera frames to your web client:

from flask import Flask, render_template, Response, jsonify, request
from camera import VideoCamera

# Flask application object; the route decorators below register views on it.
app = Flask(__name__)

# Shared VideoCamera instance; stays None until a view or generator first needs it.
video_camera = None

@app.route('/')
def index():
    """Serve the page rendered from the index.html template."""
    page = render_template('index.html')
    return page

@app.route('/capture_status', methods=['POST'])
def capture_status():
    """Trigger a document capture when the client posts {"status": "true"}.

    Returns:
        JSON ``{"result": "done"}`` after a capture was requested,
        ``{"result": "ignored"}`` for any other status value, or a 400
        response with an ``error`` field when the body is not a JSON object
        containing ``status``.
    """
    global video_camera

    if video_camera is None:
        video_camera = VideoCamera()

    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so the problem can be reported as a clean 400.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'status' not in payload:
        return jsonify(error="expected a JSON object with a 'status' field"), 400

    if payload['status'] == "true":
        video_camera.capture_frame()
        return jsonify(result="done")

    # A Flask view must always return a response; falling off the end here
    # would make Flask raise for every other status value.
    return jsonify(result="ignored")

def video_frame():
    """Yield an endless multipart stream of JPEG preview frames.

    Each yielded chunk is one part of a ``multipart/x-mixed-replace`` body
    using the boundary ``frame``. When the camera delivers no new frame, the
    last cached frame is sent again if one is available as bytes.
    """
    global video_camera

    # Local import so this function brings its own dependency into scope.
    import time

    if video_camera is None:
        video_camera = VideoCamera()

    while True:
        frame = video_camera.get_video_frame()

        if frame is None:
            frame = video_camera.get_cached_frame()

        # The fallback can be None (nothing captured yet) or something other
        # than encoded bytes; neither can be concatenated into the multipart
        # body, so wait briefly and try the camera again instead of raising.
        if not isinstance(frame, bytes):
            time.sleep(0.05)
            continue

        yield (b'--frame\r\n'
               b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n\r\n')

def image_frame():
    """Yield the most recently captured scan as a single multipart part.

    Yields nothing when no scan has been captured yet, which produces an
    empty response body.
    """
    global video_camera

    # Identity comparison is the correct test for the None placeholder.
    if video_camera is None:
        video_camera = VideoCamera()

    frame = video_camera.get_image_frame()
    if frame is None:
        return

    yield (b'--frame\r\n'
           b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n\r\n')

@app.route('/video_viewer')
def video_viewer():
    """Respond with the preview stream produced by video_frame()."""
    stream = video_frame()
    return Response(
        stream,
        mimetype='multipart/x-mixed-replace; boundary=frame',
    )

@app.route('/image_viewer')
def image_viewer():
    """Respond with the captured scan produced by image_frame()."""
    parts = image_frame()
    return Response(
        parts,
        mimetype='multipart/x-mixed-replace; boundary=frame',
    )

# Start Flask's built-in server only when this file is run as a script.
# host='0.0.0.0' listens on all network interfaces; threaded=True serves each
# request in its own thread, presumably so the endless video stream does not
# block other requests.
if __name__ == '__main__':
    app.run(host='0.0.0.0', threaded=True)

Run the app: