package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.WordTag;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.trees.Tree;
import java.io.Serializable;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/parser/lexparser/ChineseUnknownWordModel.class */
/**
 * Assigns a log-score to a (word, tag) pair using, in order of precedence:
 * <ol>
 *   <li>hard pattern rules for dates, numbers/ordinals and dotted proper names
 *       (score {@code 0.0} for the expected tag, negative infinity otherwise);</li>
 *   <li>per-tag statistics keyed on the word's first character (or on its Unicode
 *       general category when {@link #useUnicodeType} is set), if first-character
 *       mode is active;</li>
 *   <li>a per-tag Good-Turing style estimate, if enabled via {@link #useGoodTuring()}.</li>
 * </ol>
 * The model must be populated with {@link #train(Collection)} before the statistical
 * branches return anything other than negative infinity.
 *
 * <p>First characters are taken as a single UTF-16 code unit
 * ({@code substring(0, 1)} / {@code charAt(0)}).
 */
public class ChineseUnknownWordModel implements Serializable {
    private static final String encoding = "GB18030";
    private static final boolean VERBOSE = false;
    /** Pseudo first-character key that stands in for any signature not seen in training. */
    private static final String unknown = "UNK";
    private static final String dateMatch = ".*[年月日号]";
    private static final String numberMatch = ".*[０１２３４５６７８９１一二三四五六七八九十百千万亿].*";
    private static final String ordinalMatch = "第.*";
    private static final String properNameMatch = ".*·.*";
    private static final long serialVersionUID = 221;

    // Precompiled once: score() is called per (word, tag) pair and String.matches
    // would otherwise recompile each regex on every call.
    private static final Pattern DATE_PATTERN = Pattern.compile(dateMatch);
    private static final Pattern NUMBER_PATTERN = Pattern.compile(numberMatch);
    private static final Pattern ORDINAL_PATTERN = Pattern.compile(ordinalMatch);
    private static final Pattern PROPER_NAME_PATTERN = Pattern.compile(properNameMatch);

    /** When true, score by first-character statistics. */
    private boolean useFirst = true;
    /** When true, the Good-Turing estimate is available as a scoring fallback. */
    private boolean useGT = false;
    /** When true, non-OTHER_LETTER first characters are keyed by Unicode general category. */
    boolean useUnicodeType = false;
    /** tag -> (first-character signature -> log relative frequency), filled by train(). */
    private final Map<String, Counter<String>> tagHash = new HashMap<>();
    /** All first-character signatures observed during training. */
    private final Set<String> seenFirst = new HashSet<>();
    /** tag -> log Good-Turing estimate, filled by trainUnknownGT(). */
    private final Map<String, Double> unknownGT = new HashMap<>();

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Switches the model to Good-Turing scoring and turns first-character scoring off.
     */
    public void useGoodTuring() {
        this.useGT = true;
        this.useFirst = false;
    }

    /**
     * Scores an {@code IntTaggedWord} by converting it with {@code toTaggedWord()}
     * and delegating to {@link #score(TaggedWord)}.
     */
    public double score(IntTaggedWord intTaggedWord) {
        return score(intTaggedWord.toTaggedWord());
    }

    /**
     * Returns the log-score of {@code taggedWord}'s word carrying its tag.
     *
     * <p>Rules are tried in this order; the first that applies decides the result:
     * date pattern (tag must be "NT"), number pattern (tag "CD" for non-ordinals,
     * "OD" for ordinals), proper-name pattern (tag must be "NR"), first-character
     * statistics, Good-Turing estimate. If nothing applies the result is negative
     * infinity.
     *
     * @param taggedWord the word/tag pair to score
     * @return {@code 0.0}, a trained log value, or {@code Double.NEGATIVE_INFINITY}
     */
    public double score(TaggedWord taggedWord) {
        String word = taggedWord.word();
        String tag = taggedWord.tag();

        if (DATE_PATTERN.matcher(word).matches()) {
            return "NT".equals(tag) ? 0.0d : Double.NEGATIVE_INFINITY;
        }
        if (NUMBER_PATTERN.matcher(word).matches()) {
            boolean ordinal = ORDINAL_PATTERN.matcher(word).matches();
            if ("CD".equals(tag)) {
                return ordinal ? Double.NEGATIVE_INFINITY : 0.0d;
            }
            if ("OD".equals(tag)) {
                return ordinal ? 0.0d : Double.NEGATIVE_INFINITY;
            }
            return Double.NEGATIVE_INFINITY;
        }
        if (PROPER_NAME_PATTERN.matcher(word).matches()) {
            return "NR".equals(tag) ? 0.0d : Double.NEGATIVE_INFINITY;
        }

        if (this.useFirst) {
            // An empty word has no first character; score it as the unknown signature
            // instead of failing in substring/charAt.
            String first = word.isEmpty() ? unknown : firstCharSignature(word);
            if (!this.seenFirst.contains(first)) {
                if (this.useGT) {
                    // Return the Good-Turing estimate directly; falling through would
                    // discard it in favour of the first-character lookup below.
                    return scoreGT(tag);
                }
                first = unknown;
            }
            Counter<String> logProbs = this.tagHash.get(tag);
            if (logProbs == null) {
                // Tag never observed in training.
                return Double.NEGATIVE_INFINITY;
            }
            return logProbs.containsKey(first) ? logProbs.getCount(first) : logProbs.getCount(unknown);
        }

        return this.useGT ? scoreGT(tag) : Double.NEGATIVE_INFINITY;
    }

    /**
     * Computes the key under which a non-empty word is counted and looked up:
     * its first UTF-16 code unit, or - when {@link #useUnicodeType} is set and that
     * character's general category is not {@code Character.OTHER_LETTER} - the
     * category number as a decimal string.
     */
    private String firstCharSignature(String word) {
        if (this.useUnicodeType) {
            int type = Character.getType(word.charAt(0));
            if (type != Character.OTHER_LETTER) {
                return Integer.toString(type);
            }
        }
        return word.substring(0, 1);
    }

    /** Returns the stored Good-Turing log estimate for {@code str}, or negative infinity if none. */
    private double scoreGT(String str) {
        Double logProb = this.unknownGT.get(str);
        return logProb != null ? logProb.doubleValue() : Double.NEGATIVE_INFINITY;
    }

    /**
     * Trains both the Good-Turing table and the first-character table from the
     * tagged yields of the given trees.
     *
     * <p>For every tag, each first-character signature receives
     * {@code log(count(signature, tag) / (count(tag) + 1))}; the extra 1 in the
     * denominator is the single pseudo-observation given to the "UNK" signature.
     * Tokens whose word is the empty string are skipped for the first-character
     * statistics.
     *
     * @param collection training trees
     */
    public void train(Collection<Tree> collection) {
        if (this.useFirst) {
            System.err.println("ChineseUWM: treating unknown word as the average of their equivalents by first-character identity. useUnicodeType: " + this.useUnicodeType);
        }
        if (this.useGT) {
            System.err.println("ChineseUWM: using Good-Turing smoothing for unknown words.");
        }
        trainUnknownGT(collection);

        Map<String, Counter<String>> firstCountsByTag = new HashMap<>();
        Counter<String> tagTotals = new Counter<>();
        for (Tree tree : collection) {
            for (TaggedWord taggedWord : tree.taggedYield()) {
                String word = taggedWord.word();
                if (word.isEmpty()) {
                    // No first character to count.
                    continue;
                }
                String first = firstCharSignature(word);
                String tag = taggedWord.tag();
                Counter<String> firstCounts = firstCountsByTag.get(tag);
                if (firstCounts == null) {
                    firstCounts = new Counter<>();
                    firstCountsByTag.put(tag, firstCounts);
                }
                firstCounts.incrementCount(first);
                tagTotals.incrementCount(tag);
                this.seenFirst.add(first);
            }
        }

        for (Map.Entry<String, Counter<String>> entry : firstCountsByTag.entrySet()) {
            String tag = entry.getKey();
            Counter<String> firstCounts = entry.getValue();
            Counter<String> logProbs = this.tagHash.get(tag);
            if (logProbs == null) {
                logProbs = new Counter<>();
                this.tagHash.put(tag, logProbs);
            }
            // The unknown signature is treated as seen exactly once with every tag.
            tagTotals.incrementCount(tag);
            firstCounts.setCount(unknown, 1.0d);
            double total = tagTotals.getCount(tag);
            for (String first : firstCounts.keySet()) {
                logProbs.setCount(first, Math.log(firstCounts.getCount(first) / total));
            }
        }
    }

    /**
     * Fills {@link #unknownGT} with, for every tag t seen in training,
     * {@code log(r1(t) / (n(t) * n0(t)))} where r1 is the number of (word, tag) types
     * seen exactly once with t, n is the token count of t, and n0 is the number of
     * seen word types that never occurred with t. Also prints corpus statistics to
     * {@code System.err}.
     */
    private void trainUnknownGT(Collection<Tree> collection) {
        Counter<TaggedWord> taggedWordCounts = new Counter<>();
        Counter<WordTag> wordTagCounts = new Counter<>();
        Counter<String> tagCounts = new Counter<>();
        Counter<String> singletonsByTag = new Counter<>();
        Counter<String> unseenByTag = new Counter<>();
        Set<String> words = new HashSet<>();
        int tokens = 0;

        for (Tree tree : collection) {
            for (TaggedWord taggedWord : tree.taggedYield()) {
                tokens++;
                WordTag wordTag = toWordTag(taggedWord);
                wordTagCounts.incrementCount(wordTag);
                taggedWordCounts.incrementCount(taggedWord);
                tagCounts.incrementCount(wordTag.tag());
                words.add(wordTag.word());
            }
        }

        System.err.println("Total tokens: " + tokens + " [num words + numSent (boundarySymbols)]");
        System.err.println("Total WordTag types: " + wordTagCounts.keySet().size());
        System.err.println("Total TaggedWord types: " + taggedWordCounts.keySet().size() + " [should equal word types!]");
        System.err.println("Total tag types: " + tagCounts.keySet().size());
        System.err.println("Total word types: " + words.size());

        // r1: (word, tag) types observed exactly once, grouped by tag.
        for (WordTag wordTag : wordTagCounts.keySet()) {
            if (wordTagCounts.getCount(wordTag) == 1.0d) {
                singletonsByTag.incrementCount(wordTag.tag());
            }
        }

        // n0: for each tag, how many seen word types never occurred with it.
        for (String tag : tagCounts.keySet()) {
            for (String word : words) {
                if (!wordTagCounts.containsKey(new WordTag(word, tag))) {
                    unseenByTag.incrementCount(tag);
                }
            }
        }

        for (String tag : tagCounts.keySet()) {
            // NOTE(review): if every seen word type occurred with this tag, n0 is 0 and
            // this stores NaN (r1 == 0) or +Infinity (r1 > 0). Behaviour kept as-is;
            // decide on a fallback value if such corpora are possible.
            double estimate = singletonsByTag.getCount(tag) / (tagCounts.getCount(tag) * unseenByTag.getCount(tag));
            this.unknownGT.put(tag, Double.valueOf(Math.log(estimate)));
        }
    }

    /**
     * Smoke test: prints whether sample strings match the pattern rules, then prints
     * key-lookup and equality results for {@code TaggedWord} and {@code WordTag}.
     */
    public static void main(String[] strArr) {
        System.out.println("Testing unknown matching");
        report(PROPER_NAME_PATTERN.matcher("刘·革命").matches(), "names");
        // The number check is run three times on the same literal; kept so the
        // program's output is unchanged.
        report(NUMBER_PATTERN.matcher("３０００").matches(), "numbers");
        report(NUMBER_PATTERN.matcher("３０００").matches(), "numbers");
        report(NUMBER_PATTERN.matcher("３０００").matches(), "numbers");
        report(DATE_PATTERN.matcher("三月").matches(), "dates");

        System.out.println("Testing tagged word");
        Counter<TaggedWord> counter = new Counter<>();
        TaggedWord taggedWord = new TaggedWord("w", "t");
        counter.incrementCount(taggedWord);
        TaggedWord taggedWord2 = new TaggedWord("w", "t2");
        System.out.println(counter.containsKey(taggedWord2));
        System.out.println(taggedWord.equals(taggedWord2));
        WordTag wordTag = toWordTag(taggedWord);
        WordTag wordTag2 = toWordTag(taggedWord2);
        WordTag wordTag3 = new WordTag("w", "t2");
        System.out.println(wordTag.equals(wordTag2));
        System.out.println(wordTag2.equals(wordTag3));
    }

    /** Prints "hooray &lt;what&gt;!" when {@code matched} is true, "Uh-oh &lt;what&gt;!" otherwise. */
    private static void report(boolean matched, String what) {
        System.out.println((matched ? "hooray " : "Uh-oh ") + what + "!");
    }

    /** Builds a {@code WordTag} from the word and tag of the given {@code TaggedWord}. */
    private static WordTag toWordTag(TaggedWord taggedWord) {
        return new WordTag(taggedWord.word(), taggedWord.tag());
    }
}
