Getting started

Attelo is mostly a parsing library, with a couple of helper command-line tools on the side.

The bulk of attelo usage goes through the API. Below is an example showing how you might run a simple attelo decoding experiment with cross-fold validation (this is doc/quickstart.py in the attelo source tree).
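
The script expects the small example corpus under doc/example-corpus (the tiny.* files loaded below), so it should be run from the root of the attelo source tree.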

"""
Example miniature attelo evaluation for a dataset
"""

from __future__ import print_function

from os import path as fp
import os
import sys

from sklearn.linear_model import (LogisticRegression)

from attelo.decoding.mst import (MstDecoder,
                                 MstRootStrategy)
from attelo.decoding.util import (prediction_to_triples)

from attelo.learning.local import (SklearnAttachClassifier,
                                   SklearnLabelClassifier)
from attelo.parser.full import (JointPipeline)

from attelo.fold import (make_n_fold,
                         select_testing,
                         select_training)
from attelo.io import (load_multipack,
                       write_predictions_output)
from attelo.report import (CombinedReport,
                           EdgeReport)
from attelo.score import (score_edges)
from attelo.table import (DataPack)
from attelo.util import (mk_rng, Team)

# pylint: disable=invalid-name

WORKING_DIR = 'doc/example-corpus'
PREFIX = fp.join(WORKING_DIR, 'tiny')
TMP_OUTPUT = '/tmp/mini-evaluate'
if not fp.exists(TMP_OUTPUT):
    os.makedirs(TMP_OUTPUT)

# load the data
mpack = load_multipack(PREFIX + '.edus',
                       PREFIX + '.pairings',
                       PREFIX + '.features.sparse',
                       PREFIX + '.features.sparse.vocab',
                       verbose=True)
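# a multipack is a dictionary from document names to DataPacks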

# divide the dataset into folds
num_folds = min(10, len(mpack))
fold_dict = make_n_fold(mpack, num_folds, mk_rng())
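# (fold_dict assigns each document in the multipack to one of the folds)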

# select a decoder and a learner team
decoder = MstDecoder(root_strategy=MstRootStrategy.fake_root)
learners = Team(attach=SklearnAttachClassifier(LogisticRegression()),
                label=SklearnLabelClassifier(LogisticRegression()))
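# attelo treats attachment (should these two EDUs be linked?) and
# labelling (which relation holds between them?) as separate
# classification tasks; here a maxent model is used for both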

# put them together as a parser
parser = JointPipeline(learner_attach=learners.attach,
                       learner_label=learners.label,
                       decoder=decoder)
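# the joint pipeline combines the attach and label scores for each
# candidate edge and lets the MST decoder select the final structure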

# run cross-fold evaluation
scores = []
for fold in range(num_folds):
    print(">>> doing fold ", fold + 1, file=sys.stderr)
    print("training ... ", file=sys.stderr)
    # learn a model on the training data for this fold
    train_packs = select_training(mpack, fold_dict, fold).values()
    parser.fit(train_packs,
               [x.target for x in train_packs])

    fold_predictions = []
    # decode each document separately
    test_pack = select_testing(mpack, fold_dict, fold)
    for onedoc, dpack in test_pack.items():
        print("decoding on file : ", onedoc, file=sys.stderr)
        dpack = parser.transform(dpack)
        prediction = prediction_to_triples(dpack)
        # print("Predictions: ", prediction)
        # record the prediction score
        scores.append(score_edges(dpack, prediction))
        # optional: save the predictions for further inspection
        fold_predictions.extend(prediction)

    # optional: write predictions for this fold
    output_file = fp.join(TMP_OUTPUT, 'fold-%d' % fold)
    print("writing: %s" % output_file, file=sys.stderr)
    write_predictions_output(DataPack.vstack(test_pack.values()),
                             fold_predictions, output_file)

# aggregate the per-document edge scores into a single report
report = EdgeReport(scores)

# a combined report provides scores for multiple configurations
# here, we are only using it for the single config
combined_report = CombinedReport(EdgeReport,
                                 {('maxent', 'mst'): report})
print(combined_report.table())
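
Once you have settled on a configuration, you can dispense with the folds and fit a single parser on the whole dataset. Here is a minimal sketch, reusing the mpack and parser objects defined above:

# fit one parser on the entire dataset (no cross-validation)
all_packs = list(mpack.values())
parser.fit(all_packs, [x.target for x in all_packs])

# parse a single document and print its predicted edges as triples
onedoc, dpack = list(mpack.items())[0]
print(onedoc, prediction_to_triples(parser.transform(dpack)))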