lexer.go
package jmespath

import (
	"bytes"
	"encoding/json"
	"fmt"
	"strconv"
	"strings"
	"unicode/utf8"
)

type token struct {
	tokenType tokType
	value     string
	position  int
	length    int
}

type tokType int

const eof = -1

// Lexer contains information about the expression being tokenized.
type Lexer struct {
	expression string       // The expression provided by the user.
	currentPos int          // The current position in the string.
	lastWidth  int          // The width of the last rune read, used by back() to step back one rune.
	buf        bytes.Buffer // Internal buffer used for building up values.
}

// SyntaxError is the main error used whenever a lexing or parsing error occurs.
type SyntaxError struct {
	msg        string // Error message displayed to user
	Expression string // Expression that generated a SyntaxError
	Offset     int    // The location in the string where the error occurred
}

func (e SyntaxError) Error() string {
	// In the future, it would be good to underline the specific
	// location where the error occurred.
	return "SyntaxError: " + e.msg
}

// HighlightLocation will show where the syntax error occurred.
// It will place a "^" character on a line below the expression
// at the point where the syntax error occurred.
func (e SyntaxError) HighlightLocation() string {
	return e.Expression + "\n" + strings.Repeat(" ", e.Offset) + "^"
}

//go:generate stringer -type=tokType
const (
	tUnknown tokType = iota
	tStar
	tDot
	tFilter
	tFlatten
	tLparen
	tRparen
	tLbracket
	tRbracket
	tLbrace
	tRbrace
	tOr
	tPipe
	tNumber
	tUnquotedIdentifier
	tQuotedIdentifier
	tComma
	tColon
	tLT
	tLTE
	tGT
	tGTE
	tEQ
	tNE
	tJSONLiteral
	tStringLiteral
	tCurrent
	tExpref
	tAnd
	tNot
	tEOF
)

var basicTokens = map[rune]tokType{
	'.': tDot,
	'*': tStar,
	',': tComma,
	':': tColon,
	'{': tLbrace,
	'}': tRbrace,
	']': tRbracket, // tLbracket not included because it could be "[]"
	'(': tLparen,
	')': tRparen,
	'@': tCurrent,
}

// Bit mask for [a-zA-Z_], shifted down 64 bits so it fits in a single uint64.
// When using this bitmask, be sure to shift the rune down 64 bits before
// checking it against identifierStartBits.
const identifierStartBits uint64 = 576460745995190270

// Bit mask for [0-9a-zA-Z_]: 128 bits, so two uint64s. The first word
// covers runes 0-63 (the digits), the second covers runes 64-127.
var identifierTrailingBits = [2]uint64{287948901175001088, 576460745995190270}

var whiteSpace = map[rune]bool{
	' ': true, '\t': true, '\n': true, '\r': true,
}

func (t token) String() string {
	return fmt.Sprintf("Token{%+v, %s, %d, %d}",
		t.tokenType, t.value, t.position, t.length)
}

// NewLexer creates a new JMESPath lexer.
func NewLexer() *Lexer {
	lexer := Lexer{}
	return &lexer
}

func (lexer *Lexer) next() rune {
	if lexer.currentPos >= len(lexer.expression) {
		lexer.lastWidth = 0
		return eof
	}
	r, w := utf8.DecodeRuneInString(lexer.expression[lexer.currentPos:])
	lexer.lastWidth = w
	lexer.currentPos += w
	return r
}

func (lexer *Lexer) back() {
	lexer.currentPos -= lexer.lastWidth
}

func (lexer *Lexer) peek() rune {
	t := lexer.next()
	lexer.back()
	return t
}
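// The mask tests below are compact but opaque. This sketch is a hypothetical
// helper (not used by the lexer itself) that spells out the same membership
// test the tokenizer performs inline: map the rune into the mask's 64-bit
// window, then check whether its bit is set. The inline version can skip the
// explicit range check because Go defines over-long shifts to produce 0, so
// runes outside 64-127 fall through harmlessly.
func isIdentifierStart(r rune) bool {
	if r < 64 || r > 127 {
		return false // outside the window covered by identifierStartBits
	}
	// 'A' (65) maps to bit 1, '_' (95) to bit 31, 'z' (122) to bit 58.
	return identifierStartBits&(1<<(uint64(r)-64)) != 0
}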
// tokenize takes an expression and returns corresponding tokens.
func (lexer *Lexer) tokenize(expression string) ([]token, error) {
	var tokens []token
	lexer.expression = expression
	lexer.currentPos = 0
	lexer.lastWidth = 0
loop:
	for {
		r := lexer.next()
		if identifierStartBits&(1<<(uint64(r)-64)) > 0 {
			// Over-long shifts yield 0, so only [a-zA-Z_] (runes 64-127
			// with the right bit set) can start an unquoted identifier.
			t := lexer.consumeUnquotedIdentifier()
			tokens = append(tokens, t)
		} else if val, ok := basicTokens[r]; ok {
			// Basic single char token.
			t := token{
				tokenType: val,
				value:     string(r),
				position:  lexer.currentPos - lexer.lastWidth,
				length:    1,
			}
			tokens = append(tokens, t)
		} else if r == '-' || (r >= '0' && r <= '9') {
			t := lexer.consumeNumber()
			tokens = append(tokens, t)
		} else if r == '[' {
			t := lexer.consumeLBracket()
			tokens = append(tokens, t)
		} else if r == '"' {
			t, err := lexer.consumeQuotedIdentifier()
			if err != nil {
				return tokens, err
			}
			tokens = append(tokens, t)
		} else if r == '\'' {
			t, err := lexer.consumeRawStringLiteral()
			if err != nil {
				return tokens, err
			}
			tokens = append(tokens, t)
		} else if r == '`' {
			t, err := lexer.consumeLiteral()
			if err != nil {
				return tokens, err
			}
			tokens = append(tokens, t)
		} else if r == '|' {
			t := lexer.matchOrElse(r, '|', tOr, tPipe)
			tokens = append(tokens, t)
		} else if r == '<' {
			t := lexer.matchOrElse(r, '=', tLTE, tLT)
			tokens = append(tokens, t)
		} else if r == '>' {
			t := lexer.matchOrElse(r, '=', tGTE, tGT)
			tokens = append(tokens, t)
		} else if r == '!' {
			t := lexer.matchOrElse(r, '=', tNE, tNot)
			tokens = append(tokens, t)
		} else if r == '=' {
			t := lexer.matchOrElse(r, '=', tEQ, tUnknown)
			tokens = append(tokens, t)
		} else if r == '&' {
			t := lexer.matchOrElse(r, '&', tAnd, tExpref)
			tokens = append(tokens, t)
		} else if r == eof {
			break loop
		} else if _, ok := whiteSpace[r]; ok {
			// Ignore whitespace
		} else {
			return tokens, lexer.syntaxError(fmt.Sprintf("Unknown char: %s", strconv.QuoteRuneToASCII(r)))
		}
	}
	tokens = append(tokens, token{tEOF, "", len(lexer.expression), 0})
	return tokens, nil
}
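// A minimal sketch of driving the tokenizer (a hypothetical example
// function; tokenize is unexported, so this only compiles inside the
// package). For "foo.bar[0]" the loop above yields, in order:
// tUnquotedIdentifier "foo", tDot, tUnquotedIdentifier "bar",
// tLbracket, tNumber "0", tRbracket, tEOF.
func exampleTokenize() {
	tokens, err := NewLexer().tokenize("foo.bar[0]")
	if err != nil {
		fmt.Println(err)
		return
	}
	for _, t := range tokens {
		fmt.Println(t)
	}
}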
// Consume characters until the ending rune "end" is reached.
// If the end of the expression is reached before seeing the
// terminating rune "end", then an error is returned.
// If no error occurs then the matching substring is returned.
// The returned string will not include the ending rune.
func (lexer *Lexer) consumeUntil(end rune) (string, error) {
	start := lexer.currentPos
	current := lexer.next()
	for current != end && current != eof {
		if current == '\\' && lexer.peek() != eof {
			// Skip the escaped rune so an escaped delimiter
			// does not terminate the scan.
			lexer.next()
		}
		current = lexer.next()
	}
	if lexer.lastWidth == 0 {
		// We hit EOF without reaching the closing delimiter.
		return "", SyntaxError{
			msg:        "Unclosed delimiter: " + string(end),
			Expression: lexer.expression,
			Offset:     len(lexer.expression),
		}
	}
	return lexer.expression[start : lexer.currentPos-lexer.lastWidth], nil
}

func (lexer *Lexer) consumeLiteral() (token, error) {
	start := lexer.currentPos
	value, err := lexer.consumeUntil('`')
	if err != nil {
		return token{}, err
	}
	value = strings.Replace(value, "\\`", "`", -1)
	return token{
		tokenType: tJSONLiteral,
		value:     value,
		position:  start,
		length:    len(value),
	}, nil
}

func (lexer *Lexer) consumeRawStringLiteral() (token, error) {
	start := lexer.currentPos
	currentIndex := start
	current := lexer.next()
	// Test current itself against eof so an unclosed literal is
	// reported as an error rather than silently truncated.
	for current != '\'' && current != eof {
		if current == '\\' && lexer.peek() == '\'' {
			// Flush everything up to the escaped quote, emit the
			// quote itself, then skip over the escape sequence.
			chunk := lexer.expression[currentIndex : lexer.currentPos-1]
			lexer.buf.WriteString(chunk)
			lexer.buf.WriteString("'")
			lexer.next()
			currentIndex = lexer.currentPos
		}
		current = lexer.next()
	}
	if lexer.lastWidth == 0 {
		// We hit EOF without reaching the closing delimiter.
		return token{}, SyntaxError{
			msg:        "Unclosed delimiter: '",
			Expression: lexer.expression,
			Offset:     len(lexer.expression),
		}
	}
	if currentIndex < lexer.currentPos {
		lexer.buf.WriteString(lexer.expression[currentIndex : lexer.currentPos-1])
	}
	value := lexer.buf.String()
	// Reset the buffer so it can be reused.
	lexer.buf.Reset()
	return token{
		tokenType: tStringLiteral,
		value:     value,
		position:  start,
		length:    len(value),
	}, nil
}

func (lexer *Lexer) syntaxError(msg string) SyntaxError {
	return SyntaxError{
		msg:        msg,
		Expression: lexer.expression,
		Offset:     lexer.currentPos - 1,
	}
}

// Checks for a two char token, otherwise matches a single character
// token. This is used whenever a two char token overlaps a single
// char token, e.g. "||" -> tOr, "|" -> tPipe.
func (lexer *Lexer) matchOrElse(first rune, second rune, matchedType tokType, singleCharType tokType) token {
	start := lexer.currentPos - lexer.lastWidth
	nextRune := lexer.next()
	var t token
	if nextRune == second {
		t = token{
			tokenType: matchedType,
			value:     string(first) + string(second),
			position:  start,
			length:    2,
		}
	} else {
		lexer.back()
		t = token{
			tokenType: singleCharType,
			value:     string(first),
			position:  start,
			length:    1,
		}
	}
	return t
}
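// A hypothetical walkthrough of matchOrElse (not part of the lexer): "||"
// is consumed as one two-character tOr token, while a lone "|" falls back
// to tPipe. The same pattern handles "<"/"<=", ">"/">=", "!"/"!=", "=="
// and "&"/"&&".
func exampleMatchOrElse() {
	tokens, _ := NewLexer().tokenize("a || b")
	fmt.Println(tokens[1].tokenType == tOr) // true
	tokens, _ = NewLexer().tokenize("a | b")
	fmt.Println(tokens[1].tokenType == tPipe) // true
}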
func (lexer *Lexer) consumeLBracket() token {
	// There are three options here:
	// 1. A filter expression "[?"
	// 2. A flatten operator "[]"
	// 3. A bare lbracket "["
	start := lexer.currentPos - lexer.lastWidth
	nextRune := lexer.next()
	var t token
	if nextRune == '?' {
		t = token{
			tokenType: tFilter,
			value:     "[?",
			position:  start,
			length:    2,
		}
	} else if nextRune == ']' {
		t = token{
			tokenType: tFlatten,
			value:     "[]",
			position:  start,
			length:    2,
		}
	} else {
		t = token{
			tokenType: tLbracket,
			value:     "[",
			position:  start,
			length:    1,
		}
		lexer.back()
	}
	return t
}

func (lexer *Lexer) consumeQuotedIdentifier() (token, error) {
	start := lexer.currentPos
	value, err := lexer.consumeUntil('"')
	if err != nil {
		return token{}, err
	}
	// Re-wrap the raw contents in quotes and let the JSON decoder
	// handle any escape sequences.
	var decoded string
	asJSON := []byte("\"" + value + "\"")
	if err := json.Unmarshal(asJSON, &decoded); err != nil {
		return token{}, err
	}
	return token{
		tokenType: tQuotedIdentifier,
		value:     decoded,
		position:  start - 1, // include the opening quote
		length:    len(decoded),
	}, nil
}

func (lexer *Lexer) consumeUnquotedIdentifier() token {
	// Consume runes until we reach the end of an unquoted
	// identifier.
	start := lexer.currentPos - lexer.lastWidth
	for {
		r := lexer.next()
		// Only ASCII runes 0-127 are covered by the two mask words;
		// anything else (including eof) ends the identifier.
		if r < 0 || r > 127 || identifierTrailingBits[uint64(r)/64]&(1<<(uint64(r)%64)) == 0 {
			lexer.back()
			break
		}
	}
	value := lexer.expression[start:lexer.currentPos]
	return token{
		tokenType: tUnquotedIdentifier,
		value:     value,
		position:  start,
		length:    lexer.currentPos - start,
	}
}

func (lexer *Lexer) consumeNumber() token {
	// Consume runes until we reach something that's not a number.
	// The leading '-', if any, was already consumed by the caller.
	start := lexer.currentPos - lexer.lastWidth
	for {
		r := lexer.next()
		if r < '0' || r > '9' {
			lexer.back()
			break
		}
	}
	value := lexer.expression[start:lexer.currentPos]
	return token{
		tokenType: tNumber,
		value:     value,
		position:  start,
		length:    lexer.currentPos - start,
	}
}
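// A sketch of the error path (a hypothetical example function): an unclosed
// quoted identifier surfaces as a SyntaxError whose Offset points at the end
// of the expression, which HighlightLocation renders as a caret on a second
// line:
//
//	foo."bar
//	        ^
func exampleSyntaxError() {
	_, err := NewLexer().tokenize(`foo."bar`)
	if syntaxErr, ok := err.(SyntaxError); ok {
		fmt.Println(syntaxErr.HighlightLocation())
	}
}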