parse_ngx_index.go 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. package main
import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"

	"golang.org/x/net/html"
)
  10. type Directive struct {
  11. Links []string `json:"links"`
  12. }
  13. func main() {
  14. // Fetch page content
  15. resp, err := http.Get("https://nginx.org/en/docs/dirindex.html")
  16. if err != nil {
  17. fmt.Println("Error fetching page:", err)
  18. return
  19. }
  20. defer resp.Body.Close()
  21. // Parse HTML
  22. doc, err := html.Parse(resp.Body)
  23. if err != nil {
  24. fmt.Println("Error parsing HTML:", err)
  25. return
  26. }
  27. // Change storage structure to map
  28. directives := make(map[string]Directive)
  29. // Find node with id="content"
  30. var content *html.Node
  31. var findContent func(*html.Node)
  32. findContent = func(n *html.Node) {
  33. if n.Type == html.ElementNode && n.Data == "div" {
  34. for _, attr := range n.Attr {
  35. if attr.Key == "id" && attr.Val == "content" {
  36. content = n
  37. return
  38. }
  39. }
  40. }
  41. for c := n.FirstChild; c != nil; c = c.NextSibling {
  42. findContent(c)
  43. }
  44. }
  45. findContent(doc)
  46. // Extract all a tags from content
  47. if content != nil {
  48. var extractLinks func(*html.Node)
  49. extractLinks = func(n *html.Node) {
  50. if n.Type == html.ElementNode && n.Data == "a" {
  51. var href string
  52. for _, attr := range n.Attr {
  53. if attr.Key == "href" {
  54. href = attr.Val
  55. break
  56. }
  57. }
  58. if href != "" && n.FirstChild != nil {
  59. name := strings.TrimSpace(n.FirstChild.Data)
  60. if name != "" {
  61. fullLink := "https://nginx.org/en/docs/" + href
  62. directive, exists := directives[name]
  63. if !exists {
  64. directives[name] = Directive{
  65. Links: []string{fullLink},
  66. }
  67. } else {
  68. // Check if link already exists to avoid duplicates
  69. linkExists := false
  70. for _, existingLink := range directive.Links {
  71. if existingLink == fullLink {
  72. linkExists = true
  73. break
  74. }
  75. }
  76. if !linkExists {
  77. directive.Links = append(directive.Links, fullLink)
  78. directives[name] = directive
  79. }
  80. }
  81. }
  82. }
  83. }
  84. for c := n.FirstChild; c != nil; c = c.NextSibling {
  85. extractLinks(c)
  86. }
  87. }
  88. extractLinks(content)
  89. }
  90. // Write results to JSON file
  91. jsonData, err := json.MarshalIndent(directives, "", " ")
  92. if err != nil {
  93. fmt.Println("Error marshaling JSON:", err)
  94. return
  95. }
  96. err = os.WriteFile("../../internal/nginx/nginx_directives.json", jsonData, 0644)
  97. if err != nil {
  98. fmt.Println("Error writing file:", err)
  99. return
  100. }
  101. fmt.Printf("Successfully parsed %d directives and saved to nginx_directives.json\n", len(directives))
  102. }