# geolite2.py: Functions enabling querying GeoLite2 CSV
# Copyright (C) 2018 Libor Polčák <ipolcak@fit.vutbr.cz>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import collections
import csv
import os

from IPy import IP

from time_parser import FormatTimeWrapper

class geolite2_accessor():
    """ Searches preprocessed GeoLite2 CSV snapshots for data about IP addresses.

    The accessor is given a directory that holds snapshot subdirectories named
    GeoLite2-ASN-CSV_YYYYMMDD and GeoLite2-City-CSV_YYYYMMDD. Answers are
    cached in memory per (IP address, snapshot directory) pair.
    """

    class db_info():
        """ Bookkeeping for one GeoLite2 database type (ASN or City). """
        def __init__(self):
            # Timestamp of the newest snapshot directory registered so far
            self.last = 0
            # (timestamp, directory name) tuples with increasing timestamps
            self.dirs = []
            # Maps (IP address, snapshot directory path) to the answer dictionary
            self.cache = {}

    def __init__(self, path):
        """ Prepares the accessor; the file system is not touched here.

        path The directory containing the GeoLite2 snapshot subdirectories.
        """
        self.__geolite2_dir = path
        self.__geolite2_dbs = {
                "ASN": geolite2_accessor.db_info(),
                "City": geolite2_accessor.db_info(),
                }
        self.__search_func = {
                "ASN": self.get_geolite2_as,
                "City": self.get_geolite2_geolocation,
                }

    def get_data(self, ipaddr, geolite2_db, start, end):
        """ Searches for information about an IP address.

        ipaddr The IP address to be searched for (IPv4/IPv6)
        geolite2_db ASN or City
        start, end Time frames during which the search is made.

        Returns a list of two-tuples. Each tuple consists of a timestamp and a
        dictionary with known information. The list is empty if no information
        is available.

        Note that the function also returns one entry after the end of the interval.
        If there is a change comparing the last entry before end_time, it is possible
        that it happened during the interval.

        Raises KeyError if geolite2_db is neither ASN nor City.
        """
        search_dirs = self.__get_directories(geolite2_db, start, end)
        cache = self.__geolite2_dbs[geolite2_db].cache
        search = self.__search_func[geolite2_db]
        retlist = []
        for t, dirname in search_dirs:
            preprocessed_dir = "%s/%s" % (self.__geolite2_dir, dirname)
            try:
                d = cache[(ipaddr, preprocessed_dir)]
            except KeyError:
                # Only a cache miss triggers the search; other errors propagate
                d = search(ipaddr, preprocessed_dir)
            if d:
                retlist.append((t, d))
        return retlist

    @staticmethod
    def __get_preprocessed_filename_ipaddr(geolite2_dir, ip_addr):
        """ Returns the path of the networks file that should cover ip_addr.

        The groups of the address (split by "." or ":") are followed as nested
        subdirectories of <geolite2_dir>/ipv<version>/ for as long as such
        subdirectories exist; the file "networks" of the deepest existing
        directory is returned.

        geolite2_dir The snapshot directory.
        ip_addr The IP address in its textual form.
        """
        addr_wrapper = IP(ip_addr)
        version = addr_wrapper.version()
        basedir = "%s/ipv%d/" % (geolite2_dir, version)
        sep = "." if version == 4 else ":"
        groups = ip_addr.split(sep)
        subdirs = ""
        subdirs_candidate = groups.pop(0)
        while os.path.exists(basedir + subdirs_candidate):
            subdirs = subdirs_candidate
            if not groups:
                break # Every group of the address has its own directory
            subdirs_candidate += "/%s" % groups.pop(0)
        return "%s/%s/networks" % (basedir, subdirs)

    @staticmethod
    def __get_preprocessed_filename_location(geolite2_dir, geoname_id):
        """ Returns the path of the locations file that should cover geoname_id.

        The search starts in <geolite2_dir>/locations/<length of geoname_id>/ and
        descends through subdirectories named after consecutive two-character
        pieces of geoname_id for as long as such subdirectories exist.

        geolite2_dir The snapshot directory.
        geoname_id The location identifier as a string.
        """
        basedir = "%s/locations/" % (geolite2_dir, )
        basedir = "%s/%d/" % (basedir, len(geoname_id))
        pos = 0
        subdirs_candidate = geoname_id[pos:pos+2]
        # An empty candidate means that the whole identifier was consumed.
        # Without the check the loop would go on as basedir itself exists.
        while subdirs_candidate and os.path.exists(basedir + subdirs_candidate):
            basedir = "%s/%s/" % (basedir, subdirs_candidate)
            pos += 2
            subdirs_candidate = geoname_id[pos:pos+2]
        return "%s/locations" % (basedir, )

    @staticmethod
    def __get_geolite2_row(matching_func, fname):
        """ Opens a CSV file and searches for the first row accepted by matching_func.

        matching_func Allows definition of matching, e.g. for performance reasons.
                      The function should expect one parameter - the first item
                      in the row and it should return True if the match is successful.
        fname The path to the CSV file. Its first line is skipped (column names).

        Returns the matching row as a list of strings, None if not successful.
        Errors raised by open() (e.g. a missing file) are propagated.
        """
        with open(fname, encoding='utf-8') as csv_file:
            # Skip the initial line (column names); an empty file has no rows
            if next(csv_file, None) is None:
                return None
            parser = csv.reader(csv_file, delimiter=',', quotechar='"', lineterminator='\n')
            for row in parser:
                # Blank lines are parsed as empty rows, skip them
                if row and matching_func(row[0]):
                    return row
        return None

    @staticmethod
    def __get_blocks_matching_func(ip_addr):
        """ Returns matching function suitable for __get_geolite2_row().

        The returned function accepts a network in the CIDR notation and returns
        True if ip_addr belongs to the network.

        ip_addr The IP address in its textual form.
        """
        ip = IP(ip_addr)
        sep = "." if ip.version() == 4 else ":"
        # A cheap textual comparison of the first group filters out most of the
        # networks before they are parsed. NOTE(review): this presumes that no
        # network in the CSVs is shorter than the first group. For IPv6 it can be
        # checked with:
        # cut -d"," GeoLite2-ASN-Blocks-IPv6.csv -f 1 | cut -d"/" -f 2 | sort --numeric | less
        ip_beginning = ip_addr.split(sep)[0] + sep
        def blocks_matching_func(item):
            if not item.startswith(ip_beginning):
                return False
            return ip in IP(item)
        return blocks_matching_func

    @staticmethod
    def create_ordered_dict_base():
        """ Returns a new OrderedDict holding only the GeoLite2 attribution. """
        return collections.OrderedDict({"geolocation source":
                """This product includes GeoLite2 data created by MaxMind, available from <a href="http://www.maxmind.com">http://www.maxmind.com</a>."""})

    def get_geolite2_geolocation(self, ip_addr, geolite2_dir):
        """ Returns geolocation data about the IP address

        ip_addr The IP address to be searched for
        geolite2_dir The directory with the GeoLite2 CSVs

        Returns an ordered dictionary that is also stored in the City cache.
        The dictionary contains only the attribution and the network if the
        network has no location assigned or the location is not found.

        Returns None if not successful.
        """
        row = geolite2_accessor.__get_geolite2_row(geolite2_accessor.__get_blocks_matching_func(ip_addr),
                geolite2_accessor.__get_preprocessed_filename_ipaddr(geolite2_dir, ip_addr))
        if not row:
            return None
        # The attribution has to be a part of each answer
        ret_dict = self.create_ordered_dict_base()
        ret_dict["network"] = row[0]
        geoname_id = row[1]
        location_row = None
        if geoname_id: # There is nothing to look up for an empty identifier
            location_row = geolite2_accessor.__get_geolite2_row(lambda n: n == geoname_id,
                    geolite2_accessor.__get_preprocessed_filename_location(geolite2_dir, geoname_id))
        if location_row:
            for name, index in (
                    ("continent", 3),
                    ("country code", 4),
                    ("country", 5),
                    ("country part", 7),
                    ("part of country part", 9),
                    ("city", 10),
                    ("time zone", 12),
                    ):
                value = location_row[index]
                if value:
                    ret_dict[name] = value
            # The 14th column is read only if the row has it
            if len(location_row) >= 14:
                ret_dict["inside EU"] = str(location_row[13] == "1")
        self.__geolite2_dbs["City"].cache[(ip_addr, geolite2_dir)] = ret_dict
        return ret_dict

    def get_geolite2_as(self, ip_addr, geolite2_dir):
        """ Returns data about the AS of the IP address.

        ip_addr The IP address to be searched for
        geolite2_dir The directory with the GeoLite2 CSVs

        Returns an ordered dictionary that is also stored in the ASN cache.

        Returns None if not successful.
        """
        row = geolite2_accessor.__get_geolite2_row(geolite2_accessor.__get_blocks_matching_func(ip_addr),
                geolite2_accessor.__get_preprocessed_filename_ipaddr(geolite2_dir, ip_addr))
        if not row:
            return None
        # The attribution has to be a part of each answer
        ret_dict = self.create_ordered_dict_base()
        ret_dict["AS network"] = row[0]
        ret_dict["AS number"] = row[1]
        ret_dict["AS organization"] = row[2]
        self.__geolite2_dbs["ASN"].cache[(ip_addr, geolite2_dir)] = ret_dict
        return ret_dict

    def __get_directories(self, geolite2_db, start_time, end_time):
        """ Returns a list of two tuples containing directories with GeoLite2 CSVs.

        Each item in the returned list is a two tuple: (1) the timestamp and
        (2) the subdirectory name.

        geolite2_db Supported City or ASN.
        start_time Unix timestamp specifying the start of the interval.
        end_time Unix timestamp specifying the end of the interval.

        Note that the function also returns one entry after the end of the interval.
        If there is a change comparing the last entry before end_time, it is possible
        that it happened during the interval
        """
        retlist = []
        db = self.__geolite2_dbs[geolite2_db]
        if db.last < end_time:
            # The newest known snapshot is older than the interval end, so look
            # for snapshots added since the last scan
            wanted_prefix = "GeoLite2-%s-CSV" % geolite2_db
            entries = {e.name: e for e in os.scandir(self.__geolite2_dir)}
            for entry in sorted(entries):
                components = entry.split("_")
                if len(components) < 2 or components[0] != wanted_prefix:
                    continue # Not a <prefix>_<date> name
                if not entries[entry].is_dir():
                    continue
                t = FormatTimeWrapper(components[1], "%Y%m%d").get()
                if t > db.last:
                    db.dirs.append((t, entry))
                    db.last = t
        # Look to the cache
        for item in db.dirs:
            t, _ = item
            if t <= start_time:
                # Only the newest snapshot preceding the interval is of interest
                retlist = [item]
            elif start_time < t <= end_time:
                retlist.append(item)
            elif t > end_time and retlist: # Note that we add one more item to the nonempty list
                retlist.append(item)
            if t >= end_time:
                break # It does not make sense to waste CPU with additional entries
        return retlist

# Argument handling
def process_args():
    """ Parses the command line of the stand-alone search.

    Three positional arguments are expected: geolite2dir, time, and ipaddr.

    Returns the namespace produced by argparse.
    """
    from argparse import ArgumentParser
    cli = ArgumentParser(description="MaxMind database search")
    positionals = (
            ("geolite2dir", "The directory with the Geolite data."),
            ("time", "Time for which to perform the search."),
            ("ipaddr", "The ip address to be searched in the CSVs."),
            )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    return cli.parse_args()

if __name__ == "__main__":
    # Stand-alone use: print the City and the ASN answers valid at the given time
    from time_parser import TimeWrapper
    args = process_args()
    gl = geolite2_accessor(args.geolite2dir)
    t = TimeWrapper(args.time).get()
    # The searched interval consists of the single moment t
    for db_name in ("City", "ASN"):
        print(gl.get_data(args.ipaddr, db_name, t, t))
