# make sure the rest of the ABXpy package is accessible
import os
import sys

package_path = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
if package_path not in sys.path:
    sys.path.append(package_path)
# TODO: remove this dependency on ABXpy and create a separate repository
# for this?
import pandas
import numpy
import ABXpy.misc.tinytree as tinytree

# FIXME: use a single isolated | as a separator instead of two #
# custom read_table that ignores empty entries at the end of a file (they
# can result, for example, from trailing whitespace at the end of the file)
def read_table(filename):
    # the regex separator requires pandas' python parsing engine
    db = pandas.read_table(filename, sep='[ \t]+', engine='python')
    # remove rows whose values are all null (None, NaN, ...)
    db = db.dropna(how='all')
    return db
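
# Example usage (a sketch; 'example.item' is a hypothetical
# whitespace-separated file with a header row):
#
#     db = read_table('example.item')
#     print(db.columns.tolist())
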
# function that loads a database
def load(filename, features_info=False):
    # read the main database using pandas (it becomes a DataFrame)
    ext = '.item'
    if not filename.endswith(ext):
        filename = filename + ext
    db = read_table(filename)
    # find the columns whose names are prefixed with '#' (they separate
    # location info from attribute info) and strip the prefix
    columns = db.columns.tolist()
    sharp_cols = []
    for i, c in enumerate(columns):
        if c[0] == "#":
            sharp_cols.append(i)
            columns[i] = c[1:]
    db.columns = pandas.Index(columns)
    assert sharp_cols and sharp_cols[0] == 0, (
        'The first column name in the database main file must be '
        'prefixed with # (sharp)')
    assert len(sharp_cols) == 2, (
        'Exactly two column names in the database main file must be '
        'prefixed with a # (sharp)')
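    # For illustration (an assumed header, not taken from this file): a line
    # such as "#file onset offset #phone context talker" yields location
    # columns 'file onset offset' and attribute columns 'phone context
    # talker', split below on the second sharp-prefixed column.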
    # split the table: columns before the second sharp-prefixed column hold
    # item locations (for feature extraction), the rest hold item attributes
    feat_db = db[db.columns[:sharp_cols[1]]]
    db = db[db.columns[sharp_cols[1]:]]
    # verbose: print("Read input file '" + filename + "'. Defined "
    #                "conditions: " + str(columns[sharp_cols[1]:]))
    # open existing auxiliary files, merge them with the main database and,
    # at the same time, build a forest describing the hierarchy (useful for
    # optimizing regressor generation and filtering)
    basename, _ = os.path.splitext(filename)
    db, db_hierarchy = load_aux_dbs(basename, db, db.columns, filename)
    # deal with missing items: for now, rows with missing values are dropped
    # and written out for inspection
    nanrows = numpy.any(pandas.isnull(db), axis=1)
    if any(nanrows):
        dropped = db[nanrows]
        dropped.to_csv(basename + '-removed' + ext)
        db = db[~nanrows]
        feat_db = feat_db[~nanrows]
        # verbose: print('** Warning ** ' + str(len(dropped)) + ' items were'
        #                ' removed because of missing information. The'
        #                ' removed items are listed in ' + basename
        #                + '-removed.item')
    if features_info:
        return db, db_hierarchy, feat_db
    else:
        return db, db_hierarchy
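
# Example usage (a sketch; 'data/example.item' is a hypothetical database
# main file; the '.item' extension may be omitted):
#
#     db, db_hierarchy = load('data/example')
#     db, db_hierarchy, feat_db = load('data/example', features_info=True)
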
# recursive auxiliary function for loading the auxiliary databases
def load_aux_dbs(basename, db, cols, mainfile):
    forest = [tinytree.Tree() for col in cols]
    for i, col in enumerate(cols):
        forest[i].name = col
        try:
            auxfile = basename + '.' + col
            if auxfile != mainfile:
                auxdb = read_table(auxfile)
                assert col == auxdb.columns[0], (
                    'First column name in file %s'
                    ' is %s. It should be %s instead.' % (
                        auxfile, auxdb.columns[0], col))
                # call load_aux_dbs recursively on the child columns
                auxdb, auxforest = load_aux_dbs(
                    basename, auxdb, auxdb.columns[1:], mainfile)
                # add the subtrees to the forest
                forest[i].addChildrenFromList(auxforest)
                # merge the auxiliary database into the main one
                db = pandas.merge(db, auxdb, on=col, how='left')
                # verbose: print("Read auxiliary file '" + auxfile + "'. "
                #                "Defined conditions: "
                #                + str(list(auxdb.columns[1:]))
                #                + " on key '" + col + "'")
        except IOError:
            pass
    return db, forest
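
# Example of the auxiliary-file convention (an assumed layout, not taken
# from this file): if the main file is 'corpus.item' and has an attribute
# column 'talker', an optional file 'corpus.talker' whose first column is
# 'talker' provides extra per-talker attributes (e.g. sex, dialect). Those
# attributes are merged into the database and become children of the
# 'talker' node in the returned forest:
#
#     db, forest = load_aux_dbs('corpus', db, db.columns, 'corpus.item')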