/*
 * Decompiled with CFR 0.152.
 */
package opennlp.tools.formats;

import java.io.File;
import java.io.IOException;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.params.EncodingParameter;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.formats.AbstractSampleStreamFactory;
import opennlp.tools.formats.TwentyNewsgroupSampleStream;
import opennlp.tools.tokenize.AbstractTokenizer;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.ObjectStream;

/**
 * Stream factory that builds {@link DocumentSample} streams from a directory
 * laid out in the 20newsgroup format.
 *
 * <p>The tokenizer applied to the data is chosen from the command line
 * arguments: a tokenizer model file wins over a rule based tokenizer name,
 * and {@link WhitespaceTokenizer#INSTANCE} is used when neither is supplied.
 *
 * @param <P> the parameter interface type handed to the super class
 */
public class TwentyNewsgroupSampleStreamFactory<P> extends AbstractSampleStreamFactory<DocumentSample, P> {

    /**
     * Creates the factory for the given parameter interface.
     *
     * @param params the parameter interface class, passed on to the super class
     */
    protected TwentyNewsgroupSampleStreamFactory(Class<P> params) {
        super(params);
    }

    /**
     * Registers an instance of this factory with the {@link StreamFactoryRegistry}
     * for {@link DocumentSample} under the format name {@code "20newsgroup"}.
     */
    public static void registerFactory() {
        TwentyNewsgroupSampleStreamFactory<Parameters> factory =
                new TwentyNewsgroupSampleStreamFactory<>(Parameters.class);
        StreamFactoryRegistry.registerFactory(DocumentSample.class, "20newsgroup", factory);
    }

    /**
     * Parses {@code args} into {@link Parameters}, picks the tokenizer and opens
     * the sample stream on the configured data directory.
     *
     * @param args the command line arguments to parse
     * @return a stream of document samples read from the data directory
     * @throws TerminateToolException (code {@code -1}) if the tokenizer model
     *         cannot be loaded, the rule based tokenizer name is not recognised,
     *         or opening the sample data fails with an {@link IOException}
     */
    @Override
    public ObjectStream<DocumentSample> create(String[] args) {
        Parameters parsed = ArgumentParser.parse(args, Parameters.class);

        // The tokenizer is resolved first so that a bad tokenizer option is
        // reported before the data directory is touched.
        AbstractTokenizer chosenTokenizer = selectTokenizer(parsed);

        try {
            return new TwentyNewsgroupSampleStream(chosenTokenizer, parsed.getDataDir().toPath());
        }
        catch (IOException ioFailure) {
            throw new TerminateToolException(-1,
                    "IO error while opening sample data: " + ioFailure.getMessage(), ioFailure);
        }
    }

    /**
     * Resolves the tokenizer requested by the parsed parameters.
     *
     * @param parsed the parsed command line parameters
     * @return a model based tokenizer if a model file was given, otherwise the
     *         named rule based tokenizer, otherwise the whitespace tokenizer
     * @throws TerminateToolException (code {@code -1}) if the model cannot be
     *         loaded or the rule based tokenizer name is unknown
     */
    private static AbstractTokenizer selectTokenizer(Parameters parsed) {
        File modelFile = parsed.getTokenizerModel();
        if (modelFile != null) {
            // A model file takes precedence over any rule based tokenizer name.
            try {
                return new TokenizerME(new TokenizerModel(modelFile));
            }
            catch (IOException loadFailure) {
                throw new TerminateToolException(-1, "Failed to load tokenizer model!", loadFailure);
            }
        }

        String ruleBasedName = parsed.getRuleBasedTokenizer();
        if (ruleBasedName == null) {
            // Neither option was supplied: fall back to the default tokenizer.
            return WhitespaceTokenizer.INSTANCE;
        }

        switch (ruleBasedName) {
            case "simple":
                return SimpleTokenizer.INSTANCE;
            case "whitespace":
                return WhitespaceTokenizer.INSTANCE;
            default:
                throw new TerminateToolException(-1, "Unknown tokenizer: " + ruleBasedName);
        }
    }

    /**
     * Command line parameters understood by this factory, in addition to those
     * inherited from {@link EncodingParameter}.
     */
    static interface Parameters extends EncodingParameter {

        /** The required data directory argument. */
        @ArgumentParser.ParameterDescription(valueName="dataDir", description="dir containing the 20newsgroup folders")
        public File getDataDir();

        /** Optional tokenizer model file; {@code create} treats {@code null} as "not given". */
        @ArgumentParser.ParameterDescription(valueName="modelFile")
        @ArgumentParser.OptionalParameter
        public File getTokenizerModel();

        /**
         * Optional rule based tokenizer name; {@code create} accepts
         * {@code "simple"} and {@code "whitespace"} and treats {@code null} as "not given".
         */
        @ArgumentParser.ParameterDescription(valueName="name")
        @ArgumentParser.OptionalParameter
        public String getRuleBasedTokenizer();
    }
}

