Python 100 project #49: PDF to Text Converter

There are several file formats that look user-friendly but are difficult to digest in data processing. One of them is PDF. It has lots of content inside, hence it’s usually very tough to get information out of it programmatically.

This time, I used a Sophos XG Firewall Daily Executive Report PDF to retrieve hardware information.

 

Output:

$ python soreader.py 
{"CPU Usage": [{"idle_average": 93.98}, {"idle_min": 28.36}]}
{"Memory Usage": [{"used_min": 1.53}, {"used_average": 1.59}, {"used_max": 1.66}, {"total": "1.95"}]}

 

Code:

from itertools import izip_longest
import json
import re

import pdftotext

# Section titles to extract: a page is processed only when its first line
# is exactly one of these strings.
PARAMS = [
    "CPU Usage", "Memory Usage", "Disk Usage"
    ]


def retrieve_data(pdf):
    """Collect the leading table rows of every recognised report page.

    Args:
        pdf: An iterable of page texts (strings), e.g. a ``pdftotext.PDF``.

    Returns:
        A dict mapping each title from ``PARAMS`` that appears as the first
        line of a page to a list of rows, each row being the
        whitespace-split tokens of one line. The list starts and ends with
        an empty row because the matched text begins and ends with a
        newline. When several pages share a title, the last one wins.
        Pages with an unknown title, or with fewer than three lines after
        a newline, are skipped.
    """
    result_dict = {}
    for page in pdf:
        # The first line of the page is its section title; a page without
        # any newline has no complete first line and is ignored.
        title, newline, _ = page.partition("\n")
        if not newline or title not in PARAMS:
            continue

        # First run of 3-4 newline-terminated lines that follows a newline.
        matched_obj = re.search(r"\n((.*?)\n){3,4}", page)
        if matched_obj is None:
            # Too few lines to form a table: skip the page instead of
            # failing on ``None.group``.
            continue

        result_dict[title] = [
            line.split() for line in matched_obj.group(0).split("\n")
        ]

    return result_dict


def cpu_usage_idle_parser(table):
    """Return the idle CPU minimum and average as a JSON string.

    Only the first row containing the token ``'Idle'`` is used. Every token
    after ``'Idle'`` is read as a percentage; the largest value is taken to
    be the MAX column and is not reported.

    Args:
        table: Rows of string tokens, such as a value of the dict returned
            by ``retrieve_data``.

    Returns:
        JSON text of the form
        ``{"CPU Usage": [{"idle_average": ...}, {"idle_min": ...}]}``,
        or ``None`` when no row contains ``'Idle'``.

    Raises:
        ValueError: If a token after ``'Idle'`` is not numeric, or fewer
            than two values follow it.
    """
    for row in table:
        if 'Idle' not in row:
            continue

        # rstrip rather than slicing off the last character, so a value
        # written without a trailing '%' keeps its final digit.
        values = sorted(
            float(cell.rstrip('%')) for cell in row[row.index('Idle') + 1:]
        )
        if len(values) < 2:
            raise ValueError(
                "Idle row needs at least two values, got %d" % len(values)
            )

        # Ascending order: the last value is the discarded MAX, the one
        # before it is reported as the average, the first as the minimum.
        result = {'CPU Usage': [
            {'idle_average': values[-2]},
            {'idle_min': values[0]},
            ]}

        return json.dumps(result)
    return None


def memory_usage_parser(table):
    """Return used-memory statistics and total memory as a JSON string.

    The row containing ``'Used'`` supplies the statistics: every token that
    parses as a float is kept, the values are sorted, and they are reported
    as minimum, average and maximum in ascending order. The row containing
    ``'Total'`` supplies the total, which is the raw token following
    ``'Total'`` and is therefore reported as a string. If several matching
    rows exist, the last one of each kind wins.

    Args:
        table: Rows of string tokens, such as a value of the dict returned
            by ``retrieve_data``.

    Returns:
        JSON text of the form
        ``{"Memory Usage": [{"used_min": ...}, {"used_average": ...},
        {"used_max": ...}, {"total": "..."}]}``, or ``None`` when the table
        lacks a ``'Used'`` row or a ``'Total'`` row.

    Raises:
        ValueError: If the ``'Used'`` row does not hold exactly three
            numeric tokens.
        IndexError: If ``'Total'`` is the last token of its row.
    """
    used_stats = None
    total_memory = None

    for row in table:
        if 'Used' in row:
            numbers = []
            for cell in row:
                try:
                    numbers.append(float(cell))
                except ValueError:
                    # Non-numeric tokens (such as the 'Used' label itself)
                    # are expected in this row and simply skipped.
                    continue
            used_min, used_avg, used_max = sorted(numbers)
            used_stats = (used_min, used_avg, used_max)
        elif 'Total' in row:
            total_memory = row[row.index('Total') + 1]

    # Report "not found" like cpu_usage_idle_parser does instead of failing
    # on an unbound local when one of the rows is missing.
    if used_stats is None or total_memory is None:
        return None

    result = {'Memory Usage': [
        {'used_min': used_stats[0]},
        {'used_average': used_stats[1]},
        {'used_max': used_stats[2]},
        {'total': total_memory},
        ]}

    return json.dumps(result)


if __name__ == "__main__":
    import sys

    # The report path may be given as the first command-line argument;
    # without one the script reads "testreport.pdf" as before.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "testreport.pdf"

    # Load PDF file
    with open(pdf_path, "rb") as f:
        pdf = pdftotext.PDF(f)

    data = retrieve_data(pdf)

    # retrieve_data only adds keys for sections it actually found, so check
    # for each one instead of failing with a KeyError on a missing section.
    missing = False
    for section, parser in (
        ("CPU Usage", cpu_usage_idle_parser),
        ("Memory Usage", memory_usage_parser),
    ):
        if section in data:
            print(parser(data[section]))
        else:
            missing = True
            sys.stderr.write(
                "%s: section not found in %s\n" % (section, pdf_path)
            )

    if missing:
        # Keep a non-zero exit status for incomplete reports.
        sys.exit(1)