package ch.akuhn.hapax;

import ch.akuhn.hapax.corpus.CamelCaseScanner;
import ch.akuhn.hapax.corpus.LetterScanner;
import ch.akuhn.hapax.corpus.TermScanner;
import ch.akuhn.hapax.corpus.WordScanner;
import ch.akuhn.hapax.index.GlobalWeighting;
import ch.akuhn.hapax.index.LocalWeighting;
import ch.akuhn.hapax.index.TermDocumentMatrix;
import ch.akuhn.util.Files;
import java.io.File;
import java.io.InputStream;

/* loaded from: input_file:ch/akuhn/hapax/CorpusBuilder.class */
/**
 * Fluent builder that collects documents into a {@link TermDocumentMatrix} and
 * holds the settings applied to that matrix by {@link #makeTDM()} (case folding,
 * term rejection and weighting) before a {@link Hapax} is created from it.
 *
 * <p>Every configuration method returns this builder so calls can be chained.
 *
 * <p>Defaults on a fresh builder: a {@link WordScanner}, no weighting
 * ({@code LocalWeighting.NULL} / {@code GlobalWeighting.NULL}), all three
 * {@code reject*} flags enabled, case ignored, and 25 latent dimensions.
 */
public final class CorpusBuilder {
    /** Matrix that receives every added document; package-visible, presumably for {@link Hapax} (confirm). */
    TermDocumentMatrix corpus;

    /** Scanner used to tokenize documents added after it was selected. */
    TermScanner scanner = new WordScanner();

    private LocalWeighting local = LocalWeighting.NULL;
    private GlobalWeighting global = GlobalWeighting.NULL;

    // NOTE(review): the three reject* flags start out true and this class offers
    // no method that switches any of them off again.
    private boolean rejectStopwords = true;
    private boolean rejectRareTerms = true;

    // NOTE(review): written by rejectCommonTerms() but never read in this class;
    // makeTDM() does not act on it.
    private boolean rejectCommonTerms = true;

    /** Whether {@link #makeTDM()} lower-cases the matrix. */
    boolean ignoreCase = true;

    /** Not used inside this class; presumably consumed by {@link Hapax} (confirm). */
    int latentDimensions = 25;

    /**
     * Creates a builder that adds documents to the given matrix.
     *
     * @param termDocumentMatrix the matrix to populate; must not be {@code null}
     * @throws NullPointerException if {@code termDocumentMatrix} is {@code null}
     */
    public CorpusBuilder(TermDocumentMatrix termDocumentMatrix) {
        // Fail fast here instead of with an anonymous NPE in a later add*/makeTDM call.
        if (termDocumentMatrix == null) {
            throw new NullPointerException("termDocumentMatrix must not be null");
        }
        this.corpus = termDocumentMatrix;
    }

    /** Creates a builder backed by a new, empty {@link TermDocumentMatrix}. */
    public CorpusBuilder() {
        this(new TermDocumentMatrix());
    }

    /**
     * Scans the given text with the current scanner and stores the result in the
     * corpus under the given key.
     *
     * @param str  key under which the document is put into the corpus
     * @param str2 text of the document
     * @return this builder
     */
    public CorpusBuilder addDocument(String str, String str2) {
        this.corpus.putDocument(str, this.scanner.fromString(str2));
        return this;
    }

    /**
     * Adds every file returned by {@code Files.find(str, strArr)} to the corpus,
     * keyed by the file's absolute path and scanned with the current scanner.
     *
     * @param str    first argument handed to {@code Files.find}
     *               (presumably the folder to search; confirm against {@code Files.find})
     * @param strArr remaining arguments handed to {@code Files.find}
     *               (presumably file name filters; confirm against {@code Files.find})
     * @return this builder
     */
    public CorpusBuilder addFiles(String str, String... strArr) {
        for (File file : Files.find(str, strArr)) {
            this.corpus.putDocument(file.getAbsolutePath(), this.scanner.fromFile(file));
        }
        return this;
    }

    /**
     * Resets both weightings to {@code LocalWeighting.NULL} and {@code GlobalWeighting.NULL}.
     *
     * @return this builder
     */
    public CorpusBuilder dontUseWeighting() {
        this.local = LocalWeighting.NULL;
        this.global = GlobalWeighting.NULL;
        return this;
    }

    /**
     * Derives the matrix described by the current settings from the corpus.
     *
     * <p>The steps run in a fixed order: lower-casing (if case is ignored),
     * {@code rejectHapaxes()} (if rare terms are rejected), {@code rejectStopwords()}
     * (if stopwords are rejected), and finally weighting with the configured
     * local and global weightings. Each step's return value feeds the next one.
     *
     * @return the result of the final {@code weight(local, global)} call
     */
    public TermDocumentMatrix makeTDM() {
        TermDocumentMatrix tdm = this.corpus;
        if (this.ignoreCase) {
            tdm = tdm.toLowerCase();
        }
        if (this.rejectRareTerms) {
            tdm = tdm.rejectHapaxes();
        }
        if (this.rejectStopwords) {
            tdm = tdm.rejectStopwords();
        }
        return tdm.weight(this.local, this.global);
    }

    /**
     * Sets the reject-common-terms flag. The flag is not read anywhere in this class.
     *
     * @return this builder
     */
    public CorpusBuilder rejectCommonTerms() {
        this.rejectCommonTerms = true;
        return this;
    }

    /**
     * Makes {@link #makeTDM()} call {@code rejectHapaxes()} on the matrix.
     *
     * @return this builder
     */
    public CorpusBuilder rejectRareTerms() {
        this.rejectRareTerms = true;
        return this;
    }

    /**
     * Makes {@link #makeTDM()} call {@code rejectStopwords()} on the matrix.
     *
     * @return this builder
     */
    public CorpusBuilder rejectStopwords() {
        this.rejectStopwords = true;
        return this;
    }

    /**
     * Switches to a new {@link CamelCaseScanner} for documents added from now on.
     *
     * @return this builder
     */
    public CorpusBuilder useCamelCaseScanner() {
        this.scanner = new CamelCaseScanner();
        return this;
    }

    /**
     * Selects {@code LocalWeighting.TERM} and {@code GlobalWeighting.IDF} as weightings.
     *
     * @return this builder
     */
    public CorpusBuilder useTFIDF() {
        this.local = LocalWeighting.TERM;
        this.global = GlobalWeighting.IDF;
        return this;
    }

    /**
     * Switches to a new {@link WordScanner} for documents added from now on.
     *
     * @return this builder
     */
    public CorpusBuilder useWordScanner() {
        this.scanner = new WordScanner();
        return this;
    }

    /**
     * Switches to a new {@link LetterScanner} for documents added from now on.
     *
     * @return this builder
     */
    public CorpusBuilder useLetterScanner() {
        this.scanner = new LetterScanner();
        return this;
    }

    /**
     * Stops {@link #makeTDM()} from lower-casing the matrix. The method name is
     * kept exactly as it is so existing callers continue to compile.
     *
     * @return this builder
     */
    public CorpusBuilder beCaseSensitiv() {
        this.ignoreCase = false;
        return this;
    }

    /**
     * Makes {@link #makeTDM()} lower-case the matrix.
     *
     * @return this builder
     */
    public CorpusBuilder ignoreCase() {
        this.ignoreCase = true;
        return this;
    }

    /**
     * Stores the number of latent dimensions. The value is not validated here.
     *
     * @param i the number of latent dimensions to store
     * @return this builder
     */
    public CorpusBuilder latentDimensions(int i) {
        this.latentDimensions = i;
        return this;
    }

    /**
     * Creates a new {@link Hapax} from this builder.
     *
     * @return the newly constructed {@code Hapax}
     */
    public Hapax build() {
        return new Hapax(this);
    }

    /**
     * Scans the given stream with the current scanner and stores the result in
     * the corpus under the given key. This method does not close the stream itself.
     *
     * @param str         key under which the document is put into the corpus
     * @param inputStream stream supplying the document's content
     * @return this builder
     */
    public CorpusBuilder addDocument(String str, InputStream inputStream) {
        this.corpus.putDocument(str, this.scanner.fromInpuStream(inputStream));
        return this;
    }
}
