View Javadoc
1   package org.owasp.dependencycheck.xml;
2   
3   import java.io.FilterInputStream;
4   import java.io.IOException;
5   import java.io.InputStream;
6   import javax.annotation.concurrent.NotThreadSafe;
7   
8   import org.jetbrains.annotations.NotNull;
9   import org.slf4j.Logger;
10  import org.slf4j.LoggerFactory;
11  
12  /**
13   * Cleans up often very bad XML. Primarily, this will convert named HTM entities
14   * into their HTM encoded Unicode code point representation.
15   *
16   * <ol>
17   * <li>Strips leading white space</li>
18   * <li>Recodes &amp;pound; etc to &amp;#...;</li>
19   * <li>Recodes lone &amp; as &amp;amp;</li>
20   * </ol>
21   * <p>
22   * This is a slightly modified (class/method rename) from an SO answer:
23   * https://stackoverflow.com/questions/7286428/help-the-java-sax-parser-to-understand-bad-xml</p>
24   *
25   * @author https://stackoverflow.com/users/823393/oldcurmudgeon
26   */
27  @NotThreadSafe
28  public class XmlInputStream extends FilterInputStream {
29  
30      /**
31       * The logger.
32       */
33      private static final Logger LOGGER = LoggerFactory.getLogger(XmlInputStream.class);
34      /**
35       * The minimum length of characters to read.
36       */
37      private static final int MIN_LENGTH = 2;
38      /**
39       * Holder for everything we've read.
40       */
41      private final StringBuilder red = new StringBuilder();
42      /**
43       * Data that needs to be pushed back.
44       */
45      private final StringBuilder pushBack = new StringBuilder();
46      /**
47       * How much we've given them.
48       */
49      private int given = 0;
50      /**
51       * How much we've read.
52       */
53      private int pulled = 0;
54  
55      /**
56       * Constructs a new XML Input Stream.
57       *
58       * @param in the base input stream
59       */
60      public XmlInputStream(InputStream in) {
61          super(in);
62      }
63  
64      /**
65       * NB: This is a Troll length (i.e. it goes 1, 2, many) so 2 actually means
66       * "at least 2"
67       *
68       * @return the length
69       */
70      public int length() {
71          try {
72              final StringBuilder s = read(MIN_LENGTH);
73              pushBack.append(s);
74              return s.length();
75          } catch (IOException ex) {
76              LOGGER.warn("Oops ", ex);
77          }
78          return 0;
79      }
80  
81      /**
82       * Read n characters.
83       *
84       * @param n the number of characters to read
85       * @return the characters read
86       * @throws IOException thrown when an error occurs
87       */
88      private StringBuilder read(int n) throws IOException {
89          // Input stream finished?
90          boolean eof = false;
91          // Read that many.
92          final StringBuilder s = new StringBuilder(n);
93          while (s.length() < n && !eof) {
94              // Always get from the pushBack buffer.
95              if (pushBack.length() == 0) {
96                  // Read something from the stream into pushBack.
97                  eof = readIntoPushBack();
98              }
99  
100             // Pushback only contains deliverable codes.
101             if (pushBack.length() > 0) {
102                 // Grab one character
103                 s.append(pushBack.charAt(0));
104                 // Remove it from pushBack
105                 pushBack.deleteCharAt(0);
106             }
107 
108         }
109         return s;
110     }
111 
112     /**
113      * Might not actually push back anything but usually will.
114      *
115      * @return true if at end-of-file
116      * @throws IOException thrown if there is an IO exception in the underlying
117      * steam
118      */
119     private boolean readIntoPushBack() throws IOException {
120         // File finished?
121         boolean eof = false;
122         // Next char.
123         final int ch = in.read();
124         if (ch >= 0) {
125             // Discard whitespace at start?
126             if (!(pulled == 0 && isWhiteSpace(ch))) {
127                 // Good code.
128                 pulled += 1;
129                 // Parse out the &stuff;
130                 if (ch == '&') {
131                     // Process the &
132                     readAmpersand();
133                 } else {
134                     // Not an '&', just append.
135                     pushBack.append((char) ch);
136                 }
137             }
138         } else {
139             // Hit end of file.
140             eof = true;
141         }
142         return eof;
143     }
144 
145     /**
146      * Deal with an ampersand in the stream.
147      *
148      * @throws IOException thrown if an unknown entity is encountered
149      */
150     private void readAmpersand() throws IOException {
151         // Read the whole word, up to and including the ;
152         final StringBuilder reference = new StringBuilder();
153         int ch;
154         // Should end in a ';'
155         for (ch = in.read(); isAlphaNumeric(ch); ch = in.read()) {
156             reference.append((char) ch);
157         }
158         // Did we tidily finish?
159         if (ch == ';') {
160             // Yes! Translate it into a &#nnn; code.
161             final String code = XmlEntity.fromNamedReference(reference);
162             if (code != null) {
163                 // Keep it.
164                 pushBack.append(code);
165             } else {
166                 // invalid entity. Encode the & and append the sequence of chars.
167                 pushBack.append("&#38;").append(reference).append((char) ch);
168             }
169         } else {
170             // Did not terminate properly!
171             // Perhaps an & on its own or a malformed reference.
172             // Either way, escape the &
173             pushBack.append("&#38;").append(reference).append((char) ch);
174         }
175     }
176 
177     /**
178      * Keep track of what we've given them.
179      *
180      * @param s the sequence of characters given
181      * @param wanted the number of characters wanted
182      * @param got the number of characters given
183      */
184     private void given(CharSequence s, int wanted, int got) {
185         red.append(s);
186         given += got;
187         LOGGER.trace("Given: [" + wanted + "," + got + "]-" + s);
188     }
189 
190     /**
191      * Reads the next byte.
192      *
193      * @return the byte read
194      * @throws IOException thrown when there is an problem reading
195      */
196     @Override
197     public int read() throws IOException {
198         final StringBuilder s = read(1);
199         given(s, 1, 1);
200         return s.length() > 0 ? s.charAt(0) : -1;
201     }
202 
203     /**
204      * Reads the next length of bytes from the stream into the given byte array
205      * at the given offset.
206      *
207      * @param data the buffer to store the data read
208      * @param offset the offset in the buffer to start writing
209      * @param length the length of data to read
210      * @return the number of bytes read
211      * @throws IOException thrown when there is an issue with the underlying
212      * stream
213      */
214     @Override
215     public int read(@NotNull byte[] data, int offset, int length) throws IOException {
216         final StringBuilder s = read(length);
217         int n = 0;
218         for (int i = 0; i < Math.min(length, s.length()); i++) {
219             data[offset + i] = (byte) s.charAt(i);
220             n += 1;
221         }
222         given(s, length, n);
223         return n > 0 ? n : -1;
224     }
225 
226     /**
227      * To string implementation.
228      *
229      * @return a string representation of the data given and read from the
230      * stream.
231      */
232     @Override
233     public String toString() {
234         final String s = red.toString();
235         final StringBuilder h = new StringBuilder();
236         // Hex dump the small ones.
237         if (s.length() < 8) {
238             for (int i = 0; i < s.length(); i++) {
239                 h.append(" ").append(Integer.toHexString(s.charAt(i)));
240             }
241         }
242         return "[" + given + "]-\"" + s + "\"" + (h.length() > 0 ? " (" + h + ")" : "");
243     }
244 
245     /**
246      * Determines if the character is whitespace.
247      *
248      * @param ch the character to check
249      * @return true if the character is whitespace; otherwise false
250      */
251     private boolean isWhiteSpace(int ch) {
252         switch (ch) {
253             case ' ':
254             case '\r':
255             case '\n':
256             case '\t':
257                 return true;
258             default:
259                 return false;
260         }
261     }
262 
263     /**
264      * Checks if the given character is alpha-numeric.
265      *
266      * @param ch the character to check
267      * @return true if the character is alpha-numeric; otherwise false.
268      */
269     private boolean isAlphaNumeric(int ch) {
270         return ('a' <= ch && ch <= 'z')
271                 || ('A' <= ch && ch <= 'Z')
272                 || ('0' <= ch && ch <= '9');
273     }
274 }