Python 100 project #49: PDF to Text Converter

Several file formats look user-friendly but are difficult to digest in data processing. PDF is one of them: it holds so much content that it is usually very tough to extract information from it programmatically.

This time, I used a Sophos XG Firewall Daily Executive Report PDF to retrieve hardware information.



$ python 
{"CPU Usage": [{"idle_average": 93.98}, {"idle_min": 28.36}]}
{"Memory Usage": [{"used_min": 1.53}, {"used_average": 1.59}, {"used_max": 1.66}, {"total": "1.95"}]}



from itertools import izip_longest
import json
import re

import pdftotext

    "CPU Usage", "Memory Usage", "Disk Usage"

def retrieve_data(pdf, params=None):
    """Collect the table that follows a known section title on each page.

    A page is used only when its first line (surrounding whitespace ignored)
    is one of the wanted section titles.  The first run of three to four
    newline-terminated lines found after a newline -- normally the lines
    directly below the title -- is split on whitespace into rows of cells.

    Args:
        pdf: Iterable yielding the text of each page as a string (the script
            passes a ``pdftotext.PDF`` object).
        params: Optional collection of section titles to extract.  Defaults
            to the module-level ``PARAMS``.

    Returns:
        dict mapping each section title found to its table: a list of rows,
        each row being a list of whitespace-separated cells.  If several
        pages start with the same title, the last page wins.  Pages without
        a recognisable title or table are skipped.
    """
    wanted = PARAMS if params is None else params
    result_dict = {}
    # Iterate over all the pages
    for page in pdf:
        first_sentence = re.match(r"(.*?)\n", page)
        if not first_sentence:
            continue
        param = first_sentence.group(1).strip()
        if param not in wanted:
            continue
        matched_obj = re.search(r"\n((.*?)\n){3,4}", page)
        if matched_obj is None:
            # Title present but fewer than three lines follow: no table.
            continue
        matched_txt = matched_obj.group(0)
        # Blank lines would only produce empty rows, so drop them here.
        data_list = [
            line.split() for line in matched_txt.split("\n") if line.strip()
        ]
        result_dict[param] = data_list
    return result_dict

def cpu_usage_idle_parser(table):
    """Return the CPU idle MIN and AVERAGE figures as a JSON string.

    The first row containing the cell ``'Idle'`` with at least two figures
    after it is used.  The cells after ``'Idle'`` are parsed as percentages
    (a trailing ``%`` is stripped).  The largest figure is taken to be the
    MAX column and discarded; of the remaining figures the largest is
    reported as the average and the smallest as the minimum.

    Args:
        table: Iterable of rows, each row a list of string cells.

    Returns:
        A JSON string of the form
        ``{"CPU Usage": [{"idle_average": ...}, {"idle_min": ...}]}``,
        or ``None`` when no usable ``'Idle'`` row exists.

    Raises:
        ValueError: If a cell after ``'Idle'`` is not a number.
    """
    for row in table:
        if 'Idle' not in row:
            continue
        raw_data_list = row[row.index('Idle') + 1:]
        data_list = sorted(float(cell.rstrip('%')) for cell in raw_data_list)
        if len(data_list) < 2:
            # Nothing would be left once the MAX value is dropped.
            continue
        # remove largest data, which should be the MAX value
        data_list.pop()
        idle_average = data_list[-1]
        idle_min = data_list[0]
        result = {'CPU Usage': [
            {'idle_average': idle_average},
            {'idle_min': idle_min},
        ]}
        return json.dumps(result)
    return None

def memory_usage_parser(table):
    """Return memory utilisation (used MIN, AVERAGE, MAX and total) as JSON.

    In a row containing the cell ``'Used'``, every cell that parses as a
    float is collected and non-numeric cells are ignored.  The three figures
    are sorted ascending and reported as min, average and max.  In a row
    containing ``'Total'``, the cell right after it is reported unchanged
    (as a string).  If several matching rows exist, the last one wins.

    Args:
        table: Iterable of rows, each row a list of string cells.

    Returns:
        A JSON string of the form
        ``{"Memory Usage": [{"used_min": ...}, {"used_average": ...},
        {"used_max": ...}, {"total": "..."}]}``, or ``None`` when the
        ``'Used'`` or ``'Total'`` information is missing.

    Raises:
        ValueError: If a ``'Used'`` row does not hold exactly three numbers.
    """
    used_values = None
    total_memory = None
    for row in table:
        if 'Used' in row:
            data_list = []
            for column in row:
                try:
                    formated_data = float(column)
                except ValueError:
                    # Labels and units are not figures; skip them.
                    continue
                data_list.append(formated_data)
            if len(data_list) != 3:
                raise ValueError(
                    "expected 3 numeric cells in 'Used' row, got %d"
                    % len(data_list)
                )
            used_values = sorted(data_list)
        elif 'Total' in row:
            cells_after_total = row[row.index('Total') + 1:]
            if cells_after_total:
                total_memory = cells_after_total[0]
    if used_values is None or total_memory is None:
        return None
    used_min, used_avg, used_max = used_values
    result = {'Memory Usage': [
        {'used_min': used_min},
        {'used_average': used_avg},
        {'used_max': used_max},
        {'total': total_memory},
    ]}
    return json.dumps(result)

if __name__ == "__main__":
    import sys

    # Report path: first command-line argument if given, otherwise the
    # sample report next to the script.
    report_path = sys.argv[1] if len(sys.argv) > 1 else "testreport.pdf"

    # Load PDF file
    with open(report_path, "rb") as f:
        pdf = pdftotext.PDF(f)

    data = retrieve_data(pdf)
    # Print one JSON line per section; a section missing from the report is
    # reported on stderr instead of aborting with a KeyError.
    for section, parser in (
        ("CPU Usage", cpu_usage_idle_parser),
        ("Memory Usage", memory_usage_parser),
    ):
        table = data.get(section)
        if table is None:
            sys.stderr.write(
                "%s: section not found in %s\n" % (section, report_path)
            )
            continue
        print(parser(table))