Source code for reader

#!/usr/bin/env python
# coding: utf8

from __future__ import (unicode_literals, absolute_import, division, print_function)

from threading import Thread
from Queue import Queue

import display as d

import io
import re
import sys

import datetime
import time


[docs]class HTTPFormatError(Exception): """Raised when the HTTP access log line is not recognized""" pass
[docs]class LogReader(Thread): """ This thread object reads the given log file, and by default parses its lines. Then it sends them in a queue that is read by the Statistician. To read the file, the LogReader read the lines until the EOF, then waits for a given time: ``sleeping_time``. Attributes ---------- log_path: string The path to the log file. The program is terminated if the log file cannot be opened. sleeping_time: float The time in second during which the program will sleep after the EOF. parse: bool If True, the LogReader will parse the read line with the :func:`parse` function before it puts it in the Queue. If False, the Statistician will have to parse it itself. total_nb_of_line_read: int Counts the number of lines that have been read since the beginning, including the empty and commented lines. should_run: bool If False, the thread will shortly end stop its operation. Used to cleanly end the program. output_queue: Queue The queue where the read lines will be put. name: string The name of the thread: 'log reader thread' """ def __init__(self, log_path, sleeping_time=0.1, parse=True): Thread.__init__(self) self.log_path = log_path self.sleeping_time = sleeping_time self.parse = parse self.total_nb_of_line_read = 0 self.should_run = True self.output_queue = Queue() self.name = 'log reader thread'
[docs] def run(self): """ Opens the input log file at ``log_path``, goes to the EOF, then try to read new lines. If new lines are detected, sends them to the ``output_queue`` for the ``Statistician`` (parsed or not parsed depending of ``self.parse``). When EOF, waits for ``sleeping_time`` and starts again. Note ---- There are two printing systems: sys1 and sys2, used to send log messages * sys1 is used to print WARNING log messages when the LogReader is too slow: last_EOF is big * sys2 is used to print DEBUG log messages with the number of line read every second """ # opens the file try: log_file = io.open(self.log_path, 'rt') except IOError: d.displayer.log(self, d.LogLevel.CRITICAL, "wrong path for the input log '{}'".format(self.log_path)) from thread import interrupt_main interrupt_main() sys.exit() # goes to the EOF log_file.seek(0, io.SEEK_END) d.displayer.log(self, d.LogLevel.DEBUG, "At EOF, ready for reading") # print sys2: used to print DEBUG log messages with the number of line read every second last_printed_time = time.time() since_last_printed = 0 # last_EOF counts the number of lines since the last EOF last_EOF = 0 last_EOF_time = time.time() # EOF_reached_printed is used for log messages EOF_reached_printed = True # because we start at EOF # this is used to avoid a first sleeping time when the program starts and we are at EOF fist_reading_loop = True while self.should_run: EOF = False while not EOF: line = log_file.readline() if not line: EOF = True # sys1 EOF info: used to print WARNING log messages when the LogReader is too slow: last_EOF is big last_EOF = 0 last_EOF_time = time.time() else: self.total_nb_of_line_read += 1 EOF_reached_printed = False # sys1 EOF info last_EOF += 1 if last_EOF % 5000 == 0: if time.time() - last_EOF_time > 1: d.displayer.log(self, d.LogLevel.WARNING, 'last EOF {} lines ago, {:.02f}s ago' ''.format(last_EOF, time.time() - last_EOF_time)) # sys2 EOF info since_last_printed += 1 if since_last_printed % 100 == 0: delta = time.time() - last_printed_time if delta > 1: d.displayer.log(self, d.LogLevel.DEBUG, '{} lines parsed last {}s'.format(since_last_printed, delta)) last_printed_time = time.time() since_last_printed = 0 # lines are strip-ed, ie white spaces around the line are removed line = line.strip() # Only non-empty and non-commented lines are sent to the queue if not line.startswith('#') and len(line) > 0: if self.parse: try: self.output_queue.put(parse_line(line)) except HTTPFormatError as e: d.displayer.log(self, d.LogLevel.ERROR, e.message) else: self.output_queue.put(line) if not EOF_reached_printed: d.displayer.log(self, d.LogLevel.INFO, 'Log EOF reached after {} lines' ''.format(self.total_nb_of_line_read)) EOF_reached_printed = True if not fist_reading_loop: # next instruction is commented because it spams the log... # d.displayer.log(self, d.LogLevel.DEBUG, 'Sleep for {}s'.format(self.sleeping_time)) time.sleep(self.sleeping_time) fist_reading_loop = False log_file.close()
[docs] def state(self): """ Returns ------- string Describes the present thread state """ return 'total nb of read line: {}' \ ''.format(self.total_nb_of_line_read)
[docs]def parse_line(line, parse_date=False): """Parse a HTTP w3c formatted line and return a dictionary with the following keys: ``'remote_host', 'remote_log_name', 'auth_user', 'date', 'request', 'status', 'bytes'`` Note ---- * ``status`` and ``bytes`` are converted to ``int`` * ``date`` can be converted in a datetime object, UTC-time, but by default the conversion is disable (it is slow) Raises ------ HTTPFormatError """ # 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 parsed = re.match(r'^(?P<remote_host>\S*)\s*(?P<remote_log_name>\S*)\s*(?P<auth_user>\S*)\s*\[(?P<date>.*?)\]' r'\s*\"(?P<request>.*)\"\s*(?P<status>\d*)\s*(?P<bytes>\d*)$', line.strip()) if parsed is None: raise HTTPFormatError('incorrect HTTP format in line: {}'.format(line)) HTTP_dict = parsed.groupdict() try: HTTP_dict['status'], HTTP_dict['bytes'] = int(HTTP_dict['status']), int(HTTP_dict['bytes']) if parse_date: # the date is transformed in a datetime.datetime object date = HTTP_dict['date'] # the used of a delta is necessary to get real utc time because '%z' doesn't work in python<3.2! delta = datetime.timedelta(hours=int(date[-5:]) / 100) HTTP_dict['date'] = datetime.datetime.strptime(date[:-6], '%d/%b/%Y:%X') - delta except: raise HTTPFormatError('incorrect HTTP format in line: {}'.format(line)) return HTTP_dict
[docs]def get_section(request): """ Return the section name from a HTTP request, or None if not a proper HTTP request Examples -------- * ``GET /test/index/ HTTP`` => ``/test`` * ``GET /te.st/index/ HTTP`` => ``/te.st`` * ``GET /test/index.html HTTP`` => ``/test`` * ``GET /test HTTP`` => ``/testv * ``GET /test.html HTTP`` => ``/`` * ``GET / HTTP`` => ``/`` """ # section = re.match(r'^\S+\s+(/[^/ ]*)', request.strip()) section = re.match(r'^\S+\s+(/[^ /]*)', request.strip()) if section is not None: if '.' in section.group(1) and request.count('/') == 1: return '/' return section.group(1) return None
if __name__ == '__main__': # ===== HTTP access log line parser ===== print(parse_line('127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326')) print(get_section("GET /section123/image.png HTTP/1.1")) # ===== get_section() tests ===== string_="GET /test/index/ HTTP"; print(string_, " => ", get_section(string_)) string_="GET /te.st/index/ HTTP"; print(string_, " => ", get_section(string_)) string_="GET /test/index.html HTTP"; print(string_, " => ", get_section(string_)) string_="GET /test HTTP"; print(string_, " => ", get_section(string_)) string_="GET /test.html HTTP"; print(string_, " => ", get_section(string_)) string_="GET / HTTP"; print(string_, " => ", get_section(string_)) # might be useful to know the encoding # ==================================== import locale print('encoding', locale.getpreferredencoding()) # date correction process : %z doesn't work in python<3.2 # ======================================================= date = '10/Oct/2000:13:55:36 -0700' delta = datetime.timedelta(hours=int(date[-5:]) / 100) dt = datetime.datetime.strptime(date[:-6], '%d/%b/%Y:%X') - delta print('date: ', dt) # test the LogReader # ================== picasso = d.Displayer(debug=True) time.sleep(0.1) rth = LogReader('../log/simulated_log') rth.start() time.sleep(1) rth.should_run = False