1
0

regex_cache_optimization.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. package parser
  2. import (
  3. "regexp"
  4. "sync"
  5. "time"
  6. )
  7. // RegexCache provides high-performance compiled regex caching
  8. type RegexCache struct {
  9. cache map[string]*CachedRegex
  10. mutex sync.RWMutex
  11. maxSize int
  12. ttl time.Duration
  13. hits int64
  14. misses int64
  15. cleanupTicker *time.Ticker
  16. stopCleanup chan struct{}
  17. }
  // CachedRegex is a single cache entry: the compiled expression plus the
  // bookkeeping used for expiry (compiledAt) and eviction (useCount, lastUsed).
  type CachedRegex struct {
  	// regex is the compiled expression handed back to callers.
  	regex *regexp.Regexp
  	// pattern is the source text regex was compiled from.
  	pattern string

  	// compiledAt is when the entry was created; TTL checks measure from it.
  	compiledAt time.Time
  	// lastUsed is refreshed on every cache hit.
  	lastUsed time.Time
  	// useCount is the number of lookups served by this entry.
  	useCount int64
  }
  // RegexCacheStats is a JSON-serialisable snapshot of a RegexCache, as
  // produced by RegexCache.GetStats.
  type RegexCacheStats struct {
  	// Size is the number of entries held when the snapshot was taken.
  	Size int `json:"size"`
  	// MaxSize is the configured capacity.
  	MaxSize int `json:"max_size"`
  	// Hits counts lookups served from the cache.
  	Hits int64 `json:"hits"`
  	// Misses counts lookups that required a compilation attempt.
  	Misses int64 `json:"misses"`
  	// HitRate is Hits / (Hits + Misses), or 0 when nothing was looked up.
  	HitRate float64 `json:"hit_rate"`
  	// TTL is the configured time-to-live formatted by time.Duration.String.
  	TTL string `json:"ttl"`
  }
  // commonPatterns maps a short name to the source text of a regular expression
  // used for nginx access-log parsing. The values are plain strings; they are
  // compiled on demand by the RegexCache.
  var commonPatterns = map[string]string{
  	// Individual field extractors.
  	"ipv4":          `(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})`,
  	"ipv6":          `([0-9a-fA-F:]+:+[0-9a-fA-F:]*[0-9a-fA-F]+)`,
  	"timestamp":     `\[([^\]]+)\]`,
  	"method":        `"([A-Z]+)\s+`,
  	"path":          `\s+([^\s?"]+)`,
  	"protocol":      `\s+(HTTP/[0-9.]+)"`,
  	"status":        `"\s+(\d{3})\s+`,
  	"size":          `\s+(\d+|-)`,
  	"referer":       `"([^"]*)"`,
  	"user_agent":    `"([^"]*)"`,
  	"request_time":  `\s+([\d.]+)$`,
  	"upstream_time": `\s+([\d.]+)\s*$`,

  	// Whole-line formats. combined_format is main_format followed by four
  	// optional groups: two quoted fields and two numeric fields.
  	"combined_format": `^(\S+)\s+\S+\s+\S+\s+\[([^\]]+)\]\s+"([^"]+)"\s+(\d+)\s+(\d+|-)(?:\s+"([^"]*)")?(?:\s+"([^"]*)")?(?:\s+([\d.]+))?(?:\s+([\d.]+))?`,
  	"main_format":     `^(\S+)\s+\S+\s+\S+\s+\[([^\]]+)\]\s+"([^"]+)"\s+(\d+)\s+(\d+|-)`,
  }
  52. // Global regex cache instance
  53. var globalRegexCache *RegexCache
  54. var regexCacheOnce sync.Once
  55. // GetGlobalRegexCache returns the global regex cache instance
  56. func GetGlobalRegexCache() *RegexCache {
  57. regexCacheOnce.Do(func() {
  58. globalRegexCache = NewRegexCache(1000, 24*time.Hour) // 1000 patterns, 24h TTL
  59. globalRegexCache.PrecompileCommonPatterns()
  60. })
  61. return globalRegexCache
  62. }
  63. // NewRegexCache creates a new regex cache with the specified parameters
  64. func NewRegexCache(maxSize int, ttl time.Duration) *RegexCache {
  65. cache := &RegexCache{
  66. cache: make(map[string]*CachedRegex),
  67. maxSize: maxSize,
  68. ttl: ttl,
  69. stopCleanup: make(chan struct{}),
  70. }
  71. // Start cleanup routine
  72. cache.cleanupTicker = time.NewTicker(ttl / 4) // Clean every quarter of TTL
  73. go cache.cleanupRoutine()
  74. return cache
  75. }
  76. // PrecompileCommonPatterns pre-compiles common nginx log parsing patterns
  77. func (rc *RegexCache) PrecompileCommonPatterns() {
  78. for name, pattern := range commonPatterns {
  79. regex, err := regexp.Compile(pattern)
  80. if err != nil {
  81. continue // Skip invalid patterns
  82. }
  83. rc.mutex.Lock()
  84. rc.cache[name] = &CachedRegex{
  85. regex: regex,
  86. pattern: pattern,
  87. compiledAt: time.Now(),
  88. lastUsed: time.Now(),
  89. useCount: 0,
  90. }
  91. rc.mutex.Unlock()
  92. }
  93. }
  94. // GetRegex retrieves or compiles a regex pattern
  95. func (rc *RegexCache) GetRegex(pattern string) (*regexp.Regexp, error) {
  96. // Try to get from cache first
  97. rc.mutex.RLock()
  98. cached, exists := rc.cache[pattern]
  99. if exists {
  100. // Check if not expired
  101. if time.Since(cached.compiledAt) < rc.ttl {
  102. cached.lastUsed = time.Now()
  103. cached.useCount++
  104. rc.hits++
  105. rc.mutex.RUnlock()
  106. return cached.regex, nil
  107. }
  108. }
  109. rc.mutex.RUnlock()
  110. // Cache miss or expired - compile new regex
  111. regex, err := regexp.Compile(pattern)
  112. if err != nil {
  113. rc.mutex.Lock()
  114. rc.misses++
  115. rc.mutex.Unlock()
  116. return nil, err
  117. }
  118. // Store in cache
  119. rc.mutex.Lock()
  120. defer rc.mutex.Unlock()
  121. // Check if cache is full
  122. if len(rc.cache) >= rc.maxSize {
  123. rc.evictLeastUsed()
  124. }
  125. rc.cache[pattern] = &CachedRegex{
  126. regex: regex,
  127. pattern: pattern,
  128. compiledAt: time.Now(),
  129. lastUsed: time.Now(),
  130. useCount: 1,
  131. }
  132. rc.misses++
  133. return regex, nil
  134. }
  135. // GetCommonRegex retrieves a pre-compiled common pattern
  136. func (rc *RegexCache) GetCommonRegex(patternName string) (*regexp.Regexp, bool) {
  137. pattern, exists := commonPatterns[patternName]
  138. if !exists {
  139. return nil, false
  140. }
  141. regex, err := rc.GetRegex(pattern)
  142. if err != nil {
  143. return nil, false
  144. }
  145. return regex, true
  146. }
  147. // evictLeastUsed removes the least recently used entry from cache
  148. func (rc *RegexCache) evictLeastUsed() {
  149. var oldestKey string
  150. var oldestTime time.Time
  151. var lowestCount int64 = -1
  152. for key, cached := range rc.cache {
  153. if lowestCount == -1 || cached.useCount < lowestCount {
  154. lowestCount = cached.useCount
  155. oldestKey = key
  156. oldestTime = cached.lastUsed
  157. } else if cached.useCount == lowestCount && cached.lastUsed.Before(oldestTime) {
  158. oldestKey = key
  159. oldestTime = cached.lastUsed
  160. }
  161. }
  162. if oldestKey != "" {
  163. delete(rc.cache, oldestKey)
  164. }
  165. }
  166. // cleanupRoutine periodically removes expired entries
  167. func (rc *RegexCache) cleanupRoutine() {
  168. for {
  169. select {
  170. case <-rc.cleanupTicker.C:
  171. rc.cleanup()
  172. case <-rc.stopCleanup:
  173. rc.cleanupTicker.Stop()
  174. return
  175. }
  176. }
  177. }
  178. // cleanup removes expired entries from the cache
  179. func (rc *RegexCache) cleanup() {
  180. rc.mutex.Lock()
  181. defer rc.mutex.Unlock()
  182. now := time.Now()
  183. for key, cached := range rc.cache {
  184. if now.Sub(cached.compiledAt) > rc.ttl {
  185. delete(rc.cache, key)
  186. }
  187. }
  188. }
  189. // GetStats returns cache statistics
  190. func (rc *RegexCache) GetStats() RegexCacheStats {
  191. rc.mutex.RLock()
  192. defer rc.mutex.RUnlock()
  193. total := rc.hits + rc.misses
  194. var hitRate float64
  195. if total > 0 {
  196. hitRate = float64(rc.hits) / float64(total)
  197. }
  198. return RegexCacheStats{
  199. Size: len(rc.cache),
  200. MaxSize: rc.maxSize,
  201. Hits: rc.hits,
  202. Misses: rc.misses,
  203. HitRate: hitRate,
  204. TTL: rc.ttl.String(),
  205. }
  206. }
  207. // Clear clears all cached regexes
  208. func (rc *RegexCache) Clear() {
  209. rc.mutex.Lock()
  210. defer rc.mutex.Unlock()
  211. rc.cache = make(map[string]*CachedRegex)
  212. rc.hits = 0
  213. rc.misses = 0
  214. }
  215. // Close stops the cleanup routine and clears the cache
  216. func (rc *RegexCache) Close() {
  217. close(rc.stopCleanup)
  218. rc.Clear()
  219. }
  220. // OptimizedRegexMatcher provides optimized regex matching for log parsing
  221. type OptimizedRegexMatcher struct {
  222. cache *RegexCache
  223. // Pre-compiled common patterns for fastest access
  224. ipv4Regex *regexp.Regexp
  225. timestampRegex *regexp.Regexp
  226. methodRegex *regexp.Regexp
  227. statusRegex *regexp.Regexp
  228. combinedRegex *regexp.Regexp
  229. mainRegex *regexp.Regexp
  230. }
  231. // NewOptimizedRegexMatcher creates a new optimized regex matcher
  232. func NewOptimizedRegexMatcher() *OptimizedRegexMatcher {
  233. cache := GetGlobalRegexCache()
  234. matcher := &OptimizedRegexMatcher{
  235. cache: cache,
  236. }
  237. // Pre-compile most common patterns for direct access
  238. matcher.ipv4Regex, _ = cache.GetCommonRegex("ipv4")
  239. matcher.timestampRegex, _ = cache.GetCommonRegex("timestamp")
  240. matcher.methodRegex, _ = cache.GetCommonRegex("method")
  241. matcher.statusRegex, _ = cache.GetCommonRegex("status")
  242. matcher.combinedRegex, _ = cache.GetCommonRegex("combined_format")
  243. matcher.mainRegex, _ = cache.GetCommonRegex("main_format")
  244. return matcher
  245. }
  246. // MatchIPv4 matches IPv4 addresses using cached regex
  247. func (orm *OptimizedRegexMatcher) MatchIPv4(text string) []string {
  248. if orm.ipv4Regex != nil {
  249. return orm.ipv4Regex.FindStringSubmatch(text)
  250. }
  251. return nil
  252. }
  253. // MatchTimestamp matches timestamp patterns using cached regex
  254. func (orm *OptimizedRegexMatcher) MatchTimestamp(text string) []string {
  255. if orm.timestampRegex != nil {
  256. return orm.timestampRegex.FindStringSubmatch(text)
  257. }
  258. return nil
  259. }
  260. // MatchCombinedFormat matches complete combined log format
  261. func (orm *OptimizedRegexMatcher) MatchCombinedFormat(text string) []string {
  262. if orm.combinedRegex != nil {
  263. return orm.combinedRegex.FindStringSubmatch(text)
  264. }
  265. return nil
  266. }
  267. // MatchMainFormat matches main log format
  268. func (orm *OptimizedRegexMatcher) MatchMainFormat(text string) []string {
  269. if orm.mainRegex != nil {
  270. return orm.mainRegex.FindStringSubmatch(text)
  271. }
  272. return nil
  273. }
  274. // MatchPattern matches any pattern using the regex cache
  275. func (orm *OptimizedRegexMatcher) MatchPattern(pattern, text string) ([]string, error) {
  276. regex, err := orm.cache.GetRegex(pattern)
  277. if err != nil {
  278. return nil, err
  279. }
  280. return regex.FindStringSubmatch(text), nil
  281. }
  282. // DetectLogFormat detects nginx log format using cached patterns
  283. func (orm *OptimizedRegexMatcher) DetectLogFormat(logLine string) string {
  284. // Try combined format first (most common)
  285. if orm.combinedRegex != nil && orm.combinedRegex.MatchString(logLine) {
  286. return "combined"
  287. }
  288. // Try main format
  289. if orm.mainRegex != nil && orm.mainRegex.MatchString(logLine) {
  290. return "main"
  291. }
  292. return "unknown"
  293. }
  294. // GetCacheStats returns regex cache statistics
  295. func (orm *OptimizedRegexMatcher) GetCacheStats() RegexCacheStats {
  296. return orm.cache.GetStats()
  297. }
  // FastLogFormatDetector classifies log lines by counting quote and bracket
  // bytes, keeping the combined- and main-format regexes as a fallback.
  type FastLogFormatDetector struct {
  	// Fallback regexes; either may be nil if its lookup failed.
  	combinedRegex *regexp.Regexp
  	mainRegex     *regexp.Regexp

  	// Byte signatures set by the constructor.
  	// NOTE(review): no code visible in this file reads these two fields.
  	combinedPatternBytes []byte
  	mainPatternBytes     []byte
  }
  306. // NewFastLogFormatDetector creates a new fast log format detector
  307. func NewFastLogFormatDetector() *FastLogFormatDetector {
  308. cache := GetGlobalRegexCache()
  309. detector := &FastLogFormatDetector{}
  310. detector.combinedRegex, _ = cache.GetCommonRegex("combined_format")
  311. detector.mainRegex, _ = cache.GetCommonRegex("main_format")
  312. // Pre-compute pattern signatures for ultra-fast detection
  313. detector.combinedPatternBytes = []byte(`" `) // Look for quotes and spaces
  314. detector.mainPatternBytes = []byte(`[`) // Look for bracket patterns
  315. return detector
  316. }
  317. // DetectFormat detects log format with minimal overhead
  318. func (flfd *FastLogFormatDetector) DetectFormat(logLine []byte) string {
  319. // Quick heuristic checks first (much faster than regex)
  320. quoteCount := 0
  321. bracketCount := 0
  322. for _, b := range logLine {
  323. switch b {
  324. case '"':
  325. quoteCount++
  326. case '[', ']':
  327. bracketCount++
  328. }
  329. // Early termination - if we have enough quotes, likely combined format
  330. if quoteCount >= 4 {
  331. return "combined"
  332. }
  333. }
  334. // If we have brackets but few quotes, likely main format
  335. if bracketCount >= 2 && quoteCount < 4 {
  336. return "main"
  337. }
  338. // Fallback to regex matching for edge cases
  339. logLineStr := string(logLine)
  340. if flfd.combinedRegex != nil && flfd.combinedRegex.MatchString(logLineStr) {
  341. return "combined"
  342. }
  343. if flfd.mainRegex != nil && flfd.mainRegex.MatchString(logLineStr) {
  344. return "main"
  345. }
  346. return "unknown"
  347. }
  // PatternPool hands out compiled regexes by pattern text. Each pattern gets
  // its own sync.Pool whose New function returns the one regex compiled for
  // that pattern; a *regexp.Regexp is safe for concurrent use, so sharing it
  // between callers is fine.
  type PatternPool struct {
  	// patterns maps pattern text to its pool; guarded by mutex.
  	patterns map[string]*sync.Pool
  	mutex    sync.RWMutex
  }

  // NewPatternPool creates an empty pattern pool.
  func NewPatternPool() *PatternPool {
  	return &PatternPool{
  		patterns: make(map[string]*sync.Pool),
  	}
  }

  // GetPattern returns a compiled regex for pattern, compiling and registering
  // it on first use. It returns the regexp.Compile error, and a nil regex, when
  // the pattern is invalid; invalid patterns are not registered.
  func (pp *PatternPool) GetPattern(pattern string) (*regexp.Regexp, error) {
  	pp.mutex.RLock()
  	pool, exists := pp.patterns[pattern]
  	pp.mutex.RUnlock()
  	if exists {
  		return pool.Get().(*regexp.Regexp), nil
  	}

  	// Compile outside the lock; concurrent first-time callers may each
  	// compile, but only one result is registered below.
  	regex, err := regexp.Compile(pattern)
  	if err != nil {
  		return nil, err
  	}

  	pp.mutex.Lock()
  	if pp.patterns == nil {
  		// Lets a zero-value PatternPool work instead of panicking on a
  		// write to a nil map.
  		pp.patterns = make(map[string]*sync.Pool)
  	}
  	// Re-check under the write lock: another goroutine may have registered
  	// the pattern since the read-locked lookup. Keeping the first pool means
  	// every caller is served the same compiled regex.
  	if pool, exists = pp.patterns[pattern]; !exists {
  		pool = &sync.Pool{
  			New: func() interface{} {
  				return regex
  			},
  		}
  		pp.patterns[pattern] = pool
  	}
  	pp.mutex.Unlock()

  	return pool.Get().(*regexp.Regexp), nil
  }

  // PutPattern returns regex to the pool registered for pattern. The call is
  // ignored when no pool exists for pattern, when regex is nil, or when regex
  // was compiled from different pattern text: pooling either of the latter two
  // would make a later GetPattern return a nil or mismatched regex.
  func (pp *PatternPool) PutPattern(pattern string, regex *regexp.Regexp) {
  	if regex == nil || regex.String() != pattern {
  		return
  	}

  	pp.mutex.RLock()
  	pool, exists := pp.patterns[pattern]
  	pp.mutex.RUnlock()

  	if exists {
  		pool.Put(regex)
  	}
  }
  391. }