// DescriptionEcosystemMapper.java

/*
 * This file is part of dependency-check-core.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright (c) 2020 The OWASP Foundation. All Rights Reserved.
 */
package org.owasp.dependencycheck.data.nvd.ecosystem;

import org.apache.commons.lang3.StringUtils;
import io.github.jeremylong.openvulnerability.client.nvd.DefCveItem;

import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;

/**
 * Helper utility for mapping CVEs to their ecosystems based on the description.
 * <p>
 * The description is lower-cased and scanned with an Aho-Corasick trie built
 * from the file extension and description keyword hints. Every accepted hit
 * counts as one vote for the ecosystem of the hint; the ecosystem with the
 * most votes is returned.
 *
 * @author skjolber
 */
public class DescriptionEcosystemMapper {

    // shared lookup tables; populated once by the static initializer and only read afterwards
    /**
     * The array of ecosystems.
     */
    private static final String[] ECOSYSTEMS;
    /**
     * Maps the position of a hint, in the sorted key order of
     * {@link #ECOSYSTEM_MAP}, to the index of its ecosystem in
     * {@link #ECOSYSTEMS}. NOTE(review): this relies on the trie reporting
     * hit indexes in that same key order - confirm against the trie
     * implementation.
     */
    private static final int[] HINT_TO_ECOSYSTEM_LOOKUP;
    /**
     * Map of strings to ecosystems.
     */
    private static final TreeMap<String, EcosystemHint> ECOSYSTEM_MAP; // thread safe for reading

    static {
        ECOSYSTEM_MAP = new TreeMap<>();

        for (FileExtensionHint fileExtensionHint : FileExtensionHint.values()) {
            ECOSYSTEM_MAP.put(fileExtensionHint.getValue(), fileExtensionHint);
        }
        for (DescriptionKeywordHint descriptionKeywordHint : DescriptionKeywordHint.values()) {
            ECOSYSTEM_MAP.put(descriptionKeywordHint.getValue(), descriptionKeywordHint);
        }

        // ecosystem name -> index in ECOSYSTEMS, assigned on first sight
        final Map<String, Integer> ecosystemIndexes = new HashMap<>();

        HINT_TO_ECOSYSTEM_LOOKUP = new int[ECOSYSTEM_MAP.size()];

        int index = 0;
        for (Entry<String, EcosystemHint> entry : ECOSYSTEM_MAP.entrySet()) {
            final EcosystemHint ecosystemHint = entry.getValue();

            Integer ecosystemIndex = ecosystemIndexes.get(ecosystemHint.getEcosystem());
            if (ecosystemIndex == null) {
                ecosystemIndex = ecosystemIndexes.size();

                ecosystemIndexes.put(ecosystemHint.getEcosystem(), ecosystemIndex);
            }

            HINT_TO_ECOSYSTEM_LOOKUP[index] = ecosystemIndex;

            index++;
        }

        ECOSYSTEMS = new String[ecosystemIndexes.size()];
        ecosystemIndexes.forEach((key, value) -> ECOSYSTEMS[value] = key);
    }

    // the tables below are indexed directly by char value
    /**
     * Characters that may directly precede a keyword for it to count as a
     * whole word.
     */
    private static final boolean[] KEYWORD_PREFIXES = getPrefixesFor(" -(\"'");
    /**
     * Characters that may directly follow a keyword for it to count as a
     * whole word.
     */
    private static final boolean[] KEYWORD_POSTFIXES = getPrefixesFor(" -)\"',.:;");

    /**
     * Aho Corasick double array trie used for parsing and matching ecosystems.
     */
    private final StringAhoCorasickDoubleArrayTrie<EcosystemHint> ahoCorasickDoubleArrayTrie;

    /**
     * Constructs a new description ecosystem mapper.
     */
    public DescriptionEcosystemMapper() {
        ahoCorasickDoubleArrayTrie = toAhoCorasickDoubleArrayTrie();
    }

    /**
     * Builds a lookup table, indexed by char value, that is <code>true</code>
     * for every character contained in the given string.
     *
     * @param str the characters to flag
     * @return a table just large enough to hold the highest character of
     * <code>str</code>; empty if <code>str</code> is empty
     */
    protected static boolean[] getPrefixesFor(String str) {
        int max = -1;
        for (int i = 0; i < str.length(); i++) {
            if (max < str.charAt(i)) {
                max = str.charAt(i);
            }
        }

        final boolean[] delimiters = new boolean[max + 1];
        for (int i = 0; i < str.length(); i++) {
            delimiters[str.charAt(i)] = true;
        }
        return delimiters;
    }

    /**
     * Builds the trie used for matching from all known ecosystem hints.
     *
     * @return a trie built from the hint map
     */
    protected static StringAhoCorasickDoubleArrayTrie<EcosystemHint> toAhoCorasickDoubleArrayTrie() {
        final StringAhoCorasickDoubleArrayTrie<EcosystemHint> exact = new StringAhoCorasickDoubleArrayTrie<>();
        exact.build(ECOSYSTEM_MAP);
        return exact;
    }

    /**
     * Tests if the region looks like a file extension: it must not be
     * directly followed by a letter or digit, and everything after its first
     * character (expected to be the dot) must be lower case ASCII letters.
     *
     * @param str the text, in its original case
     * @param begin the index of the first character of the candidate
     * @param end the index just after the last character of the candidate
     * @return <code>true</code> if the region qualifies as an extension
     */
    protected static boolean isExtension(String str, int begin, int end) {
        if (end < str.length() && Character.isLetterOrDigit(str.charAt(end))) {
            return false;
        }

        return isLowercaseAscii(str, begin + 1, end);
    }

    /**
     * Tests if all characters in the given range are within 'a' to 'z'.
     *
     * @param multicase the text to test
     * @param start the first index to test (inclusive)
     * @param end the last index to test (exclusive)
     * @return <code>true</code> if the range is empty or contains only lower
     * case ASCII letters
     */
    protected static boolean isLowercaseAscii(String multicase, int start, int end) {
        for (int i = start; i < end; i++) {
            final char c = multicase.charAt(i);

            if (c < 'a' || c > 'z') {
                return false;
            }
        }
        return true;
    }

    /**
     * Tests if the text before the given position is part of a URL by
     * searching backwards for '://'.
     *
     * @param c the text to test.
     * @param begin the position in the string to begin searching; note the
     * search is decreasing and stops at the first space or colon, or when
     * index 2 has been examined
     * @return <code>true</code> if `://` is found; otherwise <code>false</code>
     */
    public static boolean isURL(String c, int begin) {
        // a colon at begin - 1 or begin - 2 would leave no room for the two slashes
        for (int pos = begin - 3; pos >= 2; pos--) {
            final char current = c.charAt(pos);
            if (current == ' ') {
                return false;
            }
            if (current == ':') {
                return c.charAt(pos + 1) == '/' && c.charAt(pos + 2) == '/';
            }
        }

        return false;
    }

    /**
     * Adds one vote for the ecosystem the given hint belongs to.
     *
     * @param i the index of the hint
     * @param ecosystemMap the vote counters, indexed by ecosystem
     */
    protected void increment(int i, int[] ecosystemMap) {
        ecosystemMap[HINT_TO_ECOSYSTEM_LOOKUP[i]]++;
    }

    /**
     * Returns the ecosystem if identified by English description from the CVE
     * data.
     *
     * @param cve the CVE data
     * @return the ecosystem if identified; otherwise <code>null</code>
     */
    public String getEcosystem(DefCveItem cve) {
        final int[] ecosystemMap = new int[ECOSYSTEMS.length];
        cve.getCve().getDescriptions().stream()
                // constant first so that a missing language does not throw
                .filter((langString) -> "en".equals(langString.getLang()))
                .forEachOrdered((langString) -> search(langString.getValue(), ecosystemMap));
        return getResult(ecosystemMap);
    }

    /**
     * Determines the ecosystem for the given string.
     *
     * @param multicase the string to test
     * @return the ecosystem if identified; otherwise <code>null</code>
     */
    public String getEcosystem(String multicase) {
        final int[] ecosystemMap = new int[ECOSYSTEMS.length];
        search(multicase, ecosystemMap);
        return getResult(ecosystemMap);
    }

    /**
     * Scans the text for hints and adds a vote for every accepted hit.
     *
     * @param multicase the text in its original case; <code>null</code> is
     * treated as text without any hits
     * @param ecosystemMap the vote counters, indexed by ecosystem
     */
    private void search(String multicase, int[] ecosystemMap) {
        if (multicase == null) {
            return;
        }
        // offsets reported for the lower case copy are also applied to the
        // original text, so both must have the same length
        final String c = toLowerCasePreservingLength(multicase);
        ahoCorasickDoubleArrayTrie.parseText(c, (begin, end, value, index) -> {
            final String ecosystem = value.getEcosystem();
            final boolean accepted;
            if (value.getNature() == EcosystemHintNature.FILE_EXTENSION) {
                accepted = isAcceptedExtension(multicase, c, begin, end, ecosystem);
            } else { // keyword
                accepted = isAcceptedKeyword(c, begin, end, ecosystem);
            }
            if (accepted) {
                increment(index, ecosystemMap);
            }
        });
    }

    /**
     * Lower-cases the text one char at a time. Unlike
     * {@link String#toLowerCase()} the result does not depend on the default
     * locale and always has the same length as the input.
     *
     * @param text the text to convert
     * @return the lower case text
     */
    private static String toLowerCasePreservingLength(String text) {
        final char[] chars = text.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = Character.toLowerCase(chars[i]);
        }
        return new String(chars);
    }

    /**
     * Decides if a file extension hit counts as a vote.
     *
     * @param multicase the text in its original case
     * @param lowercase the lower case copy of the text
     * @param begin the index of the first character of the hit
     * @param end the index just after the last character of the hit
     * @param ecosystem the ecosystem of the hint
     * @return <code>true</code> if the hit is a real extension and, for
     * '.php' and '.jsp', is not part of a URL
     */
    private static boolean isAcceptedExtension(String multicase, String lowercase, int begin, int end,
            String ecosystem) {
        if (!isExtension(multicase, begin, end)) {
            return false;
        }
        // real extension, if not part of url
        final boolean checkForUrl
                = (Ecosystem.PHP.equals(ecosystem) && lowercase.regionMatches(begin, ".php", 0, 4))
                || (Ecosystem.JAVA.equals(ecosystem) && lowercase.regionMatches(begin, ".jsp", 0, 4));
        return !(checkForUrl && isURL(lowercase, begin));
    }

    /**
     * Decides if a keyword hit counts as a vote.
     *
     * @param lowercase the lower case copy of the text
     * @param begin the index of the first character of the hit
     * @param end the index just after the last character of the hit
     * @param ecosystem the ecosystem of the hint
     * @return <code>true</code> if the hit is a full word and, for the native
     * ecosystem, the text does not mention android
     */
    private static boolean isAcceptedKeyword(String lowercase, int begin, int end, String ecosystem) {
        // check if full word, i.e. typically space first and then space or dot after
        if (begin != 0 && !isFlagged(KEYWORD_PREFIXES, lowercase.charAt(begin - 1))) {
            return false;
        }
        if (end != lowercase.length() && !isFlagged(KEYWORD_POSTFIXES, lowercase.charAt(end))) {
            return false;
        }
        // TODO could be checked afterwards
        return !(Ecosystem.NATIVE.equals(ecosystem) && StringUtils.contains(lowercase, "android"));
    }

    /**
     * Looks up a character in a table created by
     * {@link #getPrefixesFor(String)}.
     *
     * @param table the lookup table
     * @param ch the character to look up
     * @return <code>true</code> if the character is flagged in the table
     */
    private static boolean isFlagged(boolean[] table, char ch) {
        return ch < table.length && table[ch];
    }

    /**
     * Translates the vote counters to the name of the winning ecosystem.
     *
     * @param values the vote counters, indexed by ecosystem
     * @return the ecosystem with the most votes; <code>null</code> if there
     * are no votes
     */
    private String getResult(int[] values) {
        final int best = getBestScore(values);
        if (best != -1) {
            return ECOSYSTEMS[best];
        }
        return null;
    }

    /**
     * Finds the index of the highest positive counter; on a tie the lowest
     * index wins.
     *
     * @param values the vote counters
     * @return the index of the highest counter; -1 if no counter is positive
     */
    private int getBestScore(int[] values) {
        int bestIndex = -1;
        int bestScore = 0;
        for (int i = 0; i < values.length; i++) {
            if (values[i] > bestScore) {
                bestIndex = i;
                bestScore = values[i];
            }
        }
        return bestIndex;
    }
}