formats.go 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134
  1. package parser
  2. import (
  3. "regexp"
  4. )
  5. // Common nginx log formats
  6. var (
  7. // CombinedFormat Standard combined log format
  8. CombinedFormat = &LogFormat{
  9. Name: "combined",
  10. Pattern: regexp.MustCompile(`^(\S+) - (\S+) \[([^]]+)\] "([^"]*)" (\d+) (\d+|-) "([^"]*)" "([^"]*)"(?:\s+(\S+))?(?:\s+(\S+))?`),
  11. Fields: []string{"ip", "remote_user", "timestamp", "request", "status", "bytes_sent", "referer", "user_agent", "request_time", "upstream_time"},
  12. }
  13. // MainFormat Standard main log format (common log format)
  14. MainFormat = &LogFormat{
  15. Name: "main",
  16. Pattern: regexp.MustCompile(`^(\S+) - (\S+) \[([^]]+)\] "([^"]*)" (\d+) (\d+|-)(?:\s+"([^"]*)")?(?:\s+"([^"]*)")?`),
  17. Fields: []string{"ip", "remote_user", "timestamp", "request", "status", "bytes_sent", "referer", "user_agent"},
  18. }
  19. // DetailedFormat Custom format with more details
  20. DetailedFormat = &LogFormat{
  21. Name: "detailed",
  22. Pattern: regexp.MustCompile(`^(\S+) - (\S+) \[([^]]+)\] "([^"]*)" (\d+) (\d+|-) "([^"]*)" "([^"]*)" (\S+) (\S+) "([^"]*)" (\S+)`),
  23. Fields: []string{"ip", "remote_user", "timestamp", "request", "status", "bytes_sent", "referer", "user_agent", "request_time", "upstream_time", "x_forwarded_for", "connection"},
  24. }
  25. // SupportedFormats All supported formats ordered by priority
  26. SupportedFormats = []*LogFormat{DetailedFormat, CombinedFormat, MainFormat}
  27. )
  // FormatDetector handles automatic log format detection by testing sample
  // lines against a list of candidate formats.
  type FormatDetector struct {
  	// formats holds the candidate formats; detection iterates them in slice order.
  	formats []*LogFormat
  	// sampleSize caps how many leading input lines are examined.
  	sampleSize int
  	// matchThreshold is the minimum fraction of sampled lines that must match
  	// a format's pattern for that format to be reported.
  	matchThreshold float64
  }
  34. // NewFormatDetector creates a new format detector
  35. func NewFormatDetector() *FormatDetector {
  36. return &FormatDetector{
  37. formats: SupportedFormats,
  38. sampleSize: 100,
  39. matchThreshold: 0.8, // 80% match rate required
  40. }
  41. }
  42. // DetectFormat tries to detect the log format from sample lines
  43. func (fd *FormatDetector) DetectFormat(lines []string) *LogFormat {
  44. if len(lines) == 0 {
  45. return nil
  46. }
  47. sampleLines := lines
  48. if len(lines) > fd.sampleSize {
  49. sampleLines = lines[:fd.sampleSize]
  50. }
  51. for _, format := range fd.formats {
  52. matchCount := 0
  53. for _, line := range sampleLines {
  54. if format.Pattern.MatchString(line) {
  55. matchCount++
  56. }
  57. }
  58. matchRate := float64(matchCount) / float64(len(sampleLines))
  59. if matchRate >= fd.matchThreshold {
  60. return format
  61. }
  62. }
  63. return nil
  64. }
  65. // DetectFormatWithDetails returns detailed detection results
  66. func (fd *FormatDetector) DetectFormatWithDetails(lines []string) (*LogFormat, map[string]float64) {
  67. if len(lines) == 0 {
  68. return nil, nil
  69. }
  70. sampleLines := lines
  71. if len(lines) > fd.sampleSize {
  72. sampleLines = lines[:fd.sampleSize]
  73. }
  74. results := make(map[string]float64)
  75. var bestFormat *LogFormat
  76. var bestScore float64
  77. for _, format := range fd.formats {
  78. matchCount := 0
  79. for _, line := range sampleLines {
  80. if format.Pattern.MatchString(line) {
  81. matchCount++
  82. }
  83. }
  84. score := float64(matchCount) / float64(len(sampleLines))
  85. results[format.Name] = score
  86. if score > bestScore {
  87. bestScore = score
  88. bestFormat = format
  89. }
  90. }
  91. if bestScore >= fd.matchThreshold {
  92. return bestFormat, results
  93. }
  94. return nil, results
  95. }
  96. // AddCustomFormat adds a custom log format to the detector
  97. func (fd *FormatDetector) AddCustomFormat(format *LogFormat) {
  98. fd.formats = append([]*LogFormat{format}, fd.formats...)
  99. }
  100. // SetMatchThreshold sets the minimum match rate required for format detection
  101. func (fd *FormatDetector) SetMatchThreshold(threshold float64) {
  102. if threshold > 0 && threshold <= 1 {
  103. fd.matchThreshold = threshold
  104. }
  105. }
  106. // SetSampleSize sets the number of lines to use for format detection
  107. func (fd *FormatDetector) SetSampleSize(size int) {
  108. if size > 0 {
  109. fd.sampleSize = size
  110. }
  111. }