package opennlp.tools.tokenize;

import java.util.LinkedList;
import java.util.Set;
import java.util.regex.Pattern;
import opennlp.tools.util.Span;

/* loaded from: classes2.dex */
/**
 * A {@link Tokenizer} that splits text into WordPiece sub-word tokens using a
 * fixed vocabulary.
 *
 * <p>The produced token sequence always starts with {@code [CLS]} and ends with
 * {@code [SEP]}. Each whitespace/punctuation separated word is broken down
 * greedily, longest match first, into vocabulary entries; every piece that does
 * not start at the beginning of its word is looked up (and emitted) with a
 * {@code ##} prefix. Words that cannot be represented are marked with
 * {@code [UNK]}.
 */
public class WordpieceTokenizer implements Tokenizer {
    private static final String CLASSIFICATION_TOKEN = "[CLS]";
    private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("\\p{Punct}+");
    private static final String SEPARATOR_TOKEN = "[SEP]";
    private static final String UNKNOWN_TOKEN = "[UNK]";
    /** Prefix marking a piece that continues a word rather than starting it. */
    private static final String CONTINUATION_PREFIX = "##";
    /** Maximum word length used when none is supplied by the caller. */
    private static final int DEFAULT_MAX_TOKEN_LENGTH = 50;

    private final int maxTokenLength;
    private final Set<String> vocabulary;

    /**
     * Creates a tokenizer with the default maximum token length of 50 characters.
     *
     * @param vocabulary the set of known word pieces; continuation pieces are
     *                   expected to be stored with their {@code ##} prefix
     */
    public WordpieceTokenizer(Set<String> vocabulary) {
        this(vocabulary, DEFAULT_MAX_TOKEN_LENGTH);
    }

    /**
     * Creates a tokenizer with an explicit maximum token length.
     *
     * @param vocabulary     the set of known word pieces; continuation pieces are
     *                       expected to be stored with their {@code ##} prefix
     * @param maxTokenLength words longer than this many characters are emitted
     *                       as {@code [UNK]} without attempting to split them
     */
    public WordpieceTokenizer(Set<String> vocabulary, int maxTokenLength) {
        this.vocabulary = vocabulary;
        this.maxTokenLength = maxTokenLength;
    }

    /**
     * Returns the maximum length (in characters) a word may have before it is
     * replaced by {@code [UNK]}.
     *
     * @return the configured maximum token length
     */
    public int getMaxTokenLength() {
        return this.maxTokenLength;
    }

    /**
     * Tokenizes the given text into word pieces.
     *
     * <p>Runs of punctuation are first surrounded by spaces so that they become
     * words of their own, then the text is split with
     * {@link WhitespaceTokenizer#INSTANCE} and every resulting word is broken
     * into vocabulary pieces.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return the tokens, starting with {@code [CLS]} and ending with {@code [SEP]}
     */
    @Override // opennlp.tools.tokenize.Tokenizer
    public String[] tokenize(String text) {
        LinkedList<String> tokens = new LinkedList<>();
        tokens.add(CLASSIFICATION_TOKEN);

        // Pad punctuation runs with spaces so the whitespace split isolates them.
        String spacedText = PUNCTUATION_PATTERN.matcher(text).replaceAll(" $0 ");

        for (String word : WhitespaceTokenizer.INSTANCE.tokenize(spacedText)) {
            if (word.length() <= this.maxTokenLength) {
                appendWordPieces(word, tokens);
            } else {
                // Overlong words are not worth splitting; mark them as unknown.
                tokens.add(UNKNOWN_TOKEN);
            }
        }

        tokens.add(SEPARATOR_TOKEN);
        return tokens.toArray(new String[0]);
    }

    /**
     * Greedily splits one word into the longest vocabulary pieces available,
     * left to right, and appends them to {@code tokens}.
     *
     * <p>If at some position no piece of any length is in the vocabulary,
     * {@code [UNK]} is appended and processing of the word stops; pieces that
     * were already appended for the word are kept.
     */
    private void appendWordPieces(String word, LinkedList<String> tokens) {
        int start = 0;
        while (start < word.length()) {
            int end = word.length();
            String match = null;

            // Shrink the candidate from the right until it is a known piece.
            while (start < end) {
                String candidate = word.substring(start, end);
                if (start > 0) {
                    candidate = CONTINUATION_PREFIX + candidate;
                }
                if (this.vocabulary.contains(candidate)) {
                    match = candidate;
                    break;
                }
                end--;
            }

            if (match == null) {
                // Nothing starting at this position is in the vocabulary.
                tokens.add(UNKNOWN_TOKEN);
                return;
            }

            tokens.add(match);
            // Continue right after the piece that was just consumed.
            start = end;
        }
    }

    /**
     * Not implemented: token positions are not computed by this tokenizer.
     *
     * @param text the text to tokenize (ignored)
     * @return always {@code null}; callers must not rely on a span array
     */
    @Override // opennlp.tools.tokenize.Tokenizer
    public Span[] tokenizePos(String text) {
        return null;
    }
}
