1
0

simd_optimizations.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583
  1. package parser
  import (
  	"bytes"
  )
  // SIMD-style string processing for nginx log parsing.
  // The helpers below classify bytes with precomputed lookup tables and scan
  // the input in plain Go loops rather than with explicit vector instructions.
  // SIMDStringMatcher classifies bytes through precomputed 256-entry tables,
  // one table per character class. A table is indexed directly by the byte
  // value, so a membership test is a single array load.
  //
  // The zero value has every table cleared; NewSIMDStringMatcher returns a
  // matcher whose tables are populated.
  type SIMDStringMatcher struct {
  	// Whitespace, quote and bracket classes.
  	spaceLookup, quoteLookup, bracketLookup [256]bool
  	// Decimal and hexadecimal digit classes.
  	digitLookup, hexLookup [256]bool
  }
  14. // NewSIMDStringMatcher creates a new SIMD-optimized string matcher
  15. func NewSIMDStringMatcher() *SIMDStringMatcher {
  16. matcher := &SIMDStringMatcher{}
  17. matcher.initLookupTables()
  18. return matcher
  19. }
  20. // initLookupTables initializes lookup tables for fast character classification
  21. func (sm *SIMDStringMatcher) initLookupTables() {
  22. // Space characters lookup
  23. spaces := []byte{' ', '\t', '\n', '\r'}
  24. for _, c := range spaces {
  25. sm.spaceLookup[c] = true
  26. }
  27. // Quote characters lookup
  28. quotes := []byte{'"', '\''}
  29. for _, c := range quotes {
  30. sm.quoteLookup[c] = true
  31. }
  32. // Bracket characters lookup
  33. brackets := []byte{'[', ']', '(', ')', '{', '}'}
  34. for _, c := range brackets {
  35. sm.bracketLookup[c] = true
  36. }
  37. // Digit characters lookup
  38. for i := '0'; i <= '9'; i++ {
  39. sm.digitLookup[i] = true
  40. }
  41. // Hexadecimal characters lookup
  42. for i := '0'; i <= '9'; i++ {
  43. sm.hexLookup[i] = true
  44. }
  45. for i := 'A'; i <= 'F'; i++ {
  46. sm.hexLookup[i] = true
  47. }
  48. for i := 'a'; i <= 'f'; i++ {
  49. sm.hexLookup[i] = true
  50. }
  51. }
  52. // FindNextSpace finds the next space character using SIMD-like operations
  53. func (sm *SIMDStringMatcher) FindNextSpace(data []byte, start int) int {
  54. if start >= len(data) {
  55. return -1
  56. }
  57. // Process 8 bytes at a time for better cache utilization
  58. const blockSize = 8
  59. end := len(data)
  60. i := start
  61. // Vectorized search - process multiple bytes at once
  62. for i+blockSize <= end {
  63. // Check 8 bytes in parallel using lookup table
  64. for j := 0; j < blockSize; j++ {
  65. if sm.spaceLookup[data[i+j]] {
  66. return i + j
  67. }
  68. }
  69. i += blockSize
  70. }
  71. // Handle remaining bytes
  72. for i < end {
  73. if sm.spaceLookup[data[i]] {
  74. return i
  75. }
  76. i++
  77. }
  78. return -1
  79. }
  80. // FindNextQuote finds the next quote character using optimized search
  81. func (sm *SIMDStringMatcher) FindNextQuote(data []byte, start int) int {
  82. if start >= len(data) {
  83. return -1
  84. }
  85. const blockSize = 8
  86. end := len(data)
  87. i := start
  88. // Vectorized search for quotes
  89. for i+blockSize <= end {
  90. for j := 0; j < blockSize; j++ {
  91. if sm.quoteLookup[data[i+j]] {
  92. return i + j
  93. }
  94. }
  95. i += blockSize
  96. }
  97. // Handle remaining bytes
  98. for i < end {
  99. if sm.quoteLookup[data[i]] {
  100. return i
  101. }
  102. i++
  103. }
  104. return -1
  105. }
  106. // FindNextDigit finds the next digit character using optimized search
  107. func (sm *SIMDStringMatcher) FindNextDigit(data []byte, start int) int {
  108. if start >= len(data) {
  109. return -1
  110. }
  111. const blockSize = 8
  112. end := len(data)
  113. i := start
  114. // Vectorized search for digits
  115. for i+blockSize <= end {
  116. for j := 0; j < blockSize; j++ {
  117. if sm.digitLookup[data[i+j]] {
  118. return i + j
  119. }
  120. }
  121. i += blockSize
  122. }
  123. // Handle remaining bytes
  124. for i < end {
  125. if sm.digitLookup[data[i]] {
  126. return i
  127. }
  128. i++
  129. }
  130. return -1
  131. }
  132. // ExtractIPAddress extracts IP address using SIMD-optimized operations
  133. func (sm *SIMDStringMatcher) ExtractIPAddress(data []byte, start int) (string, int) {
  134. if start >= len(data) {
  135. return "", -1
  136. }
  137. // Find start of IP (first digit)
  138. ipStart := sm.FindNextDigit(data, start)
  139. if ipStart == -1 {
  140. return "", -1
  141. }
  142. // Find end of IP (first space after IP)
  143. ipEnd := sm.FindNextSpace(data, ipStart)
  144. if ipEnd == -1 {
  145. ipEnd = len(data)
  146. }
  147. // Validate IP format using fast checks
  148. ipBytes := data[ipStart:ipEnd]
  149. if sm.isValidIPFormat(ipBytes) {
  150. return unsafeBytesToString(ipBytes), ipEnd
  151. }
  152. return "", -1
  153. }
  154. // isValidIPFormat quickly validates IP format using SIMD-like operations
  155. func (sm *SIMDStringMatcher) isValidIPFormat(data []byte) bool {
  156. if len(data) < 7 || len(data) > 15 { // Min: 1.1.1.1, Max: 255.255.255.255
  157. return false
  158. }
  159. dotCount := 0
  160. digitCount := 0
  161. // Fast validation using lookup tables
  162. for _, b := range data {
  163. if b == '.' {
  164. dotCount++
  165. if digitCount == 0 || digitCount > 3 {
  166. return false
  167. }
  168. digitCount = 0
  169. } else if sm.digitLookup[b] {
  170. digitCount++
  171. } else {
  172. return false
  173. }
  174. }
  175. return dotCount == 3 && digitCount > 0 && digitCount <= 3
  176. }
  177. // ExtractTimestamp extracts timestamp using SIMD-optimized bracket search
  178. func (sm *SIMDStringMatcher) ExtractTimestamp(data []byte, start int) (string, int) {
  179. if start >= len(data) {
  180. return "", -1
  181. }
  182. // Find opening bracket
  183. openBracket := sm.findBracket(data, start, '[')
  184. if openBracket == -1 {
  185. return "", -1
  186. }
  187. // Find closing bracket
  188. closeBracket := sm.findBracket(data, openBracket+1, ']')
  189. if closeBracket == -1 {
  190. return "", -1
  191. }
  192. // Extract timestamp content (exclude brackets)
  193. timestampBytes := data[openBracket+1 : closeBracket]
  194. return unsafeBytesToString(timestampBytes), closeBracket + 1
  195. }
  196. // findBracket finds specific bracket character using optimized search
  197. func (sm *SIMDStringMatcher) findBracket(data []byte, start int, bracket byte) int {
  198. if start >= len(data) {
  199. return -1
  200. }
  201. const blockSize = 8
  202. end := len(data)
  203. i := start
  204. // Vectorized search for specific bracket
  205. for i+blockSize <= end {
  206. for j := range blockSize {
  207. if data[i+j] == bracket {
  208. return i + j
  209. }
  210. }
  211. i += blockSize
  212. }
  213. // Handle remaining bytes
  214. for i < end {
  215. if data[i] == bracket {
  216. return i
  217. }
  218. i++
  219. }
  220. return -1
  221. }
  222. // ExtractQuotedString extracts quoted string using optimized quote search
  223. func (sm *SIMDStringMatcher) ExtractQuotedString(data []byte, start int) (string, int) {
  224. if start >= len(data) {
  225. return "", -1
  226. }
  227. // Find opening quote
  228. openQuote := sm.FindNextQuote(data, start)
  229. if openQuote == -1 {
  230. return "", -1
  231. }
  232. // Find closing quote (skip escaped quotes)
  233. closeQuote := sm.findClosingQuote(data, openQuote+1, data[openQuote])
  234. if closeQuote == -1 {
  235. return "", -1
  236. }
  237. // Extract string content (exclude quotes)
  238. stringBytes := data[openQuote+1 : closeQuote]
  239. return unsafeBytesToString(stringBytes), closeQuote + 1
  240. }
  241. // findClosingQuote finds matching closing quote, handling escapes
  242. func (sm *SIMDStringMatcher) findClosingQuote(data []byte, start int, quoteChar byte) int {
  243. if start >= len(data) {
  244. return -1
  245. }
  246. i := start
  247. for i < len(data) {
  248. if data[i] == quoteChar {
  249. // Check if it's escaped
  250. if i == start || data[i-1] != '\\' {
  251. return i
  252. }
  253. }
  254. i++
  255. }
  256. return -1
  257. }
  258. // ExtractStatusCode extracts HTTP status code using optimized digit search
  259. func (sm *SIMDStringMatcher) ExtractStatusCode(data []byte, start int) (int, int) {
  260. if start >= len(data) {
  261. return 0, -1
  262. }
  263. // Find start of status code (3 consecutive digits)
  264. statusStart := sm.findStatusCodeStart(data, start)
  265. if statusStart == -1 {
  266. return 0, -1
  267. }
  268. // Extract 3-digit status code
  269. if statusStart+2 >= len(data) {
  270. return 0, -1
  271. }
  272. // Fast integer conversion for 3-digit status codes
  273. status := int(data[statusStart]-'0')*100 +
  274. int(data[statusStart+1]-'0')*10 +
  275. int(data[statusStart+2]-'0')
  276. return status, statusStart + 3
  277. }
  278. // findStatusCodeStart finds start of 3-digit HTTP status code
  279. func (sm *SIMDStringMatcher) findStatusCodeStart(data []byte, start int) int {
  280. if start+2 >= len(data) {
  281. return -1
  282. }
  283. for i := start; i <= len(data)-3; i++ {
  284. // Check if we have 3 consecutive digits
  285. if sm.digitLookup[data[i]] &&
  286. sm.digitLookup[data[i+1]] &&
  287. sm.digitLookup[data[i+2]] {
  288. // Validate it's a proper HTTP status code (100-599)
  289. firstDigit := int(data[i] - '0')
  290. if firstDigit >= 1 && firstDigit <= 5 {
  291. // Also check that it's preceded by a quote and space or space
  292. if i > 0 && (data[i-1] == ' ' || data[i-1] == '"') {
  293. return i
  294. }
  295. // If we're looking at a pattern like '" 200 ', this is likely the status code
  296. if i > 1 && data[i-2] == '"' && data[i-1] == ' ' {
  297. return i
  298. }
  299. }
  300. }
  301. }
  302. return -1
  303. }
  304. // ParseLogLineSIMD parses a complete log line using SIMD optimizations
  305. func (sm *SIMDStringMatcher) ParseLogLineSIMD(data []byte) *AccessLogEntry {
  306. if len(data) == 0 {
  307. return nil
  308. }
  309. entry := &AccessLogEntry{}
  310. pos := 0
  311. // Extract IP address
  312. if ip, newPos := sm.ExtractIPAddress(data, pos); ip != "" {
  313. entry.IP = ip
  314. pos = newPos
  315. } else {
  316. return nil
  317. }
  318. // Skip user fields (- -)
  319. pos = sm.skipUserFields(data, pos)
  320. if pos == -1 {
  321. return nil
  322. }
  323. // Extract timestamp
  324. if timestampStr, newPos := sm.ExtractTimestamp(data, pos); timestampStr != "" {
  325. // Note: In production, you'd parse this timestamp string to int64
  326. // For now, storing as 0 to avoid parsing complexity in SIMD implementation
  327. entry.Timestamp = 0
  328. pos = newPos
  329. }
  330. // Extract request (quoted string) - parse method/path from it
  331. if request, newPos := sm.ExtractQuotedString(data, pos); request != "" {
  332. // Parse method and path from request string
  333. sm.parseRequestComponents(request, entry)
  334. pos = newPos
  335. }
  336. // Extract status code
  337. if status, newPos := sm.ExtractStatusCode(data, pos); status > 0 {
  338. entry.Status = status
  339. pos = newPos
  340. }
  341. // Extract size (next number)
  342. if size, newPos := sm.extractSize(data, pos); newPos != -1 {
  343. entry.BytesSent = size
  344. pos = newPos
  345. }
  346. // Extract referer (quoted string)
  347. if referer, newPos := sm.ExtractQuotedString(data, pos); referer != "" {
  348. entry.Referer = referer
  349. pos = newPos
  350. }
  351. // Extract user agent (quoted string)
  352. if userAgent, _ := sm.ExtractQuotedString(data, pos); userAgent != "" {
  353. entry.UserAgent = userAgent
  354. }
  355. return entry
  356. }
  357. // parseRequestComponents parses method, path, and protocol from request string
  358. func (sm *SIMDStringMatcher) parseRequestComponents(request string, entry *AccessLogEntry) {
  359. requestBytes := []byte(request)
  360. // Find first space (after method)
  361. firstSpace := sm.FindNextSpace(requestBytes, 0)
  362. if firstSpace == -1 {
  363. return
  364. }
  365. // Extract method
  366. entry.Method = unsafeBytesToString(requestBytes[:firstSpace])
  367. // Find second space (after path)
  368. secondSpace := sm.FindNextSpace(requestBytes, firstSpace+1)
  369. if secondSpace == -1 {
  370. // Only method and path, no protocol
  371. entry.Path = unsafeBytesToString(requestBytes[firstSpace+1:])
  372. return
  373. }
  374. // Extract path and protocol
  375. entry.Path = unsafeBytesToString(requestBytes[firstSpace+1 : secondSpace])
  376. entry.Protocol = unsafeBytesToString(requestBytes[secondSpace+1:])
  377. }
  378. // skipUserFields skips the user fields (typically "- -")
  379. func (sm *SIMDStringMatcher) skipUserFields(data []byte, start int) int {
  380. pos := start
  381. spaceCount := 0
  382. for pos < len(data) && spaceCount < 2 {
  383. if sm.spaceLookup[data[pos]] {
  384. spaceCount++
  385. }
  386. pos++
  387. }
  388. if spaceCount < 2 {
  389. return -1
  390. }
  391. return pos
  392. }
  393. // extractSize extracts size field (number or "-")
  394. func (sm *SIMDStringMatcher) extractSize(data []byte, start int) (int64, int) {
  395. // Skip leading spaces
  396. pos := start
  397. for pos < len(data) && sm.spaceLookup[data[pos]] {
  398. pos++
  399. }
  400. if pos >= len(data) {
  401. return 0, -1
  402. }
  403. // Check for "-" (no size)
  404. if data[pos] == '-' {
  405. return 0, pos + 1
  406. }
  407. // Extract numeric size
  408. sizeStart := pos
  409. for pos < len(data) && sm.digitLookup[data[pos]] {
  410. pos++
  411. }
  412. if pos == sizeStart {
  413. return 0, -1
  414. }
  415. // Fast integer conversion
  416. var size int64
  417. for i := sizeStart; i < pos; i++ {
  418. size = size*10 + int64(data[i]-'0')
  419. }
  420. return size, pos
  421. }
  422. // BatchParseSIMD parses multiple log lines using SIMD optimizations
  423. func (sm *SIMDStringMatcher) BatchParseSIMD(lines [][]byte) []*AccessLogEntry {
  424. entries := make([]*AccessLogEntry, 0, len(lines))
  425. for _, line := range lines {
  426. if entry := sm.ParseLogLineSIMD(line); entry != nil {
  427. entries = append(entries, entry)
  428. }
  429. }
  430. return entries
  431. }
  432. // OptimizedLogLineParser provides a high-performance parser using SIMD operations
  433. type OptimizedLogLineParser struct {
  434. matcher *SIMDStringMatcher
  435. pool *AccessLogEntryPool
  436. }
  437. // NewOptimizedLogLineParser creates a new optimized parser
  438. func NewOptimizedLogLineParser() *OptimizedLogLineParser {
  439. return &OptimizedLogLineParser{
  440. matcher: NewSIMDStringMatcher(),
  441. pool: NewAccessLogEntryPool(),
  442. }
  443. }
  444. // ParseLine parses a single log line with maximum performance
  445. func (olp *OptimizedLogLineParser) ParseLine(data []byte) *AccessLogEntry {
  446. return olp.matcher.ParseLogLineSIMD(data)
  447. }
  448. // ParseLines parses multiple lines efficiently
  449. func (olp *OptimizedLogLineParser) ParseLines(lines [][]byte) []*AccessLogEntry {
  450. return olp.matcher.BatchParseSIMD(lines)
  451. }
  452. // AccessLogEntryPool provides object pooling for AccessLogEntry
  453. type AccessLogEntryPool struct {
  454. entries chan *AccessLogEntry
  455. }
  456. // NewAccessLogEntryPool creates a new object pool
  457. func NewAccessLogEntryPool() *AccessLogEntryPool {
  458. pool := &AccessLogEntryPool{
  459. entries: make(chan *AccessLogEntry, 1000),
  460. }
  461. // Pre-populate pool
  462. for i := 0; i < 100; i++ {
  463. pool.entries <- &AccessLogEntry{}
  464. }
  465. return pool
  466. }
  467. // Get retrieves an entry from the pool
  468. func (pool *AccessLogEntryPool) Get() *AccessLogEntry {
  469. select {
  470. case entry := <-pool.entries:
  471. return entry
  472. default:
  473. return &AccessLogEntry{}
  474. }
  475. }
  476. // Put returns an entry to the pool
  477. func (pool *AccessLogEntryPool) Put(entry *AccessLogEntry) {
  478. // Reset entry fields
  479. *entry = AccessLogEntry{}
  480. select {
  481. case pool.entries <- entry:
  482. default:
  483. // Pool is full, let GC handle it
  484. }
  485. }