block by palewire 766772

A PDF-to-text ripper lifted from the EveryBlock source code.

pdf.py

"""
Utilities for reading data from PDF files.

Lifted from EveryBlock source code.

These require the pdftotext binary, available in the Xpdf package:
    http://www.foolabs.com/xpdf/download.html
"""

import os

# Name (or path) of the pdftotext executable that pdf_to_text() runs.
# Change this if the binary is not reachable through PATH under this name.
PDFTOTEXT_BINARY = 'pdftotext'

def pdf_to_text(filename, keep_layout=True, raw=False):
    """
    Returns the text of the PDF with the given filename on the local filesystem.

    Runs the executable named by PDFTOTEXT_BINARY on the file and returns
    everything it writes to standard output.

    Arguments:
        filename: path of the PDF file to convert.
        keep_layout: if true, pass the "-layout" option to pdftotext.
        raw: if true, pass the "-raw" option to pdftotext. May not be
            combined with keep_layout.

    Raises:
        ValueError: if both keep_layout and raw are true.
        OSError: if the pdftotext executable cannot be started.

    The exit status of pdftotext is not checked: if the conversion fails,
    whatever was written to standard output (possibly nothing) is returned.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import list.
    import subprocess

    if keep_layout and raw:
        raise ValueError('The "keep_layout" and "raw" arguments may not be used together')
    args = [PDFTOTEXT_BINARY]
    if keep_layout:
        args.append('-layout')
    if raw:
        args.append('-raw')
    # The trailing "-" asks pdftotext to write the text to standard output
    # instead of to a .txt file.
    args.extend([filename, '-'])
    # Pass an argument list and bypass the shell entirely: the filename
    # reaches pdftotext verbatim, so quotes, spaces or shell metacharacters
    # in it can neither break the command nor inject another one.
    # universal_newlines=True returns text, as os.popen() did.
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, universal_newlines=True)
    # communicate() reads to EOF, closes the pipe and reaps the child.
    output, _ = proc.communicate()
    return output

def pdfstring_to_text(pdf_string, keep_layout=True, raw=False):
    """
    Returns the text of the given PDF (provided as a string).

    The PDF data is written to a temporary file, that file is converted
    with pdf_to_text(), and the temporary file is removed again whether or
    not the write or the conversion succeeds.

    Arguments:
        pdf_string: the raw contents of a PDF file. It is written to a
            file opened in binary mode.
        keep_layout: forwarded to pdf_to_text().
        raw: forwarded to pdf_to_text().

    Raises whatever the file write or pdf_to_text() raises.
    """
    import os
    from tempfile import mkstemp

    fd, name = mkstemp()
    try:
        # The file object takes ownership of the descriptor; leaving the
        # "with" block flushes and closes it, so pdftotext sees the full
        # contents on disk.
        with os.fdopen(fd, 'wb') as fp:
            fp.write(pdf_string)
        return pdf_to_text(name, keep_layout, raw)
    finally:
        # Runs even when the write above fails, so no temp file is leaked.
        os.unlink(name)