dashboard async, clickable datapoints that go to commit url

Alex Cheema 5 months ago
commit b000d23b2a

1 changed file with 153 additions and 113 deletions

extra/dashboard/dashboard.py (+153 -113)

@@ -1,14 +1,15 @@
 import os
 import json
 import logging
-import requests
+import asyncio
+import aiohttp
 import pandas as pd
 import plotly.express as px
 from datetime import datetime
 from typing import List, Dict, Optional
 from pathlib import Path
 
-class CircleCIClient:
+class AsyncCircleCIClient:
     def __init__(self, token: str, project_slug: str):
         self.token = token
         self.project_slug = project_slug
@@ -19,66 +20,57 @@ class CircleCIClient:
         }
         self.logger = logging.getLogger("CircleCI")
 
-    def get_recent_pipelines(self, limit: int = 25) -> List[Dict]:
+    async def get_json(self, session: aiohttp.ClientSession, url: str, params: Dict = None) -> Dict:
+        async with session.get(url, params=params) as response:
+            response.raise_for_status()
+            return await response.json()
+
+    async def get_recent_pipelines(self, session: aiohttp.ClientSession, limit: int = 50) -> List[Dict]:
         self.logger.info(f"Fetching {limit} recent pipelines...")
         url = f"{self.base_url}/project/{self.project_slug}/pipeline"
-        params = {"limit": limit * 2}  # Fetch extra to account for failed builds
-
-        response = requests.get(url, headers=self.headers, params=params)
-        response.raise_for_status()
-
-        pipelines = [p for p in response.json()["items"] if p["state"] == "created"]
-        pipelines = pipelines[:limit]
-        self.logger.info(f"Found {len(pipelines)} successful pipelines")
-
-        # Fetch additional data for each pipeline
-        detailed_pipelines = []
-        for pipeline in pipelines:
-            try:
-                url = f"{self.base_url}/pipeline/{pipeline['id']}"
-                response = requests.get(url, headers=self.headers)
-                response.raise_for_status()
-                detailed_pipelines.append(response.json())
-            except Exception as e:
-                self.logger.warning(f"Could not fetch details for pipeline {pipeline['id']}: {e}")
-                continue
+        params = {"limit": limit * 2}
 
-        return detailed_pipelines
+        data = await self.get_json(session, url, params)
+        pipelines = [
+            p for p in data["items"]
+            if p["state"] == "created"
+            and p.get("trigger_parameters", {}).get("git", {}).get("branch") == "main"
+        ][:limit]
 
-    def get_workflow_jobs(self, pipeline_id: str) -> List[Dict]:
+        self.logger.info(f"Found {len(pipelines)} successful main branch pipelines")
+        return pipelines
+
+    async def get_workflow_jobs(self, session: aiohttp.ClientSession, pipeline_id: str) -> List[Dict]:
         self.logger.debug(f"Fetching workflows for pipeline {pipeline_id}")
         url = f"{self.base_url}/pipeline/{pipeline_id}/workflow"
-        response = requests.get(url, headers=self.headers)
-        response.raise_for_status()
-        workflows = response.json()["items"]
+        workflows_data = await self.get_json(session, url)
+        workflows = workflows_data["items"]
 
-        jobs = []
+        # Fetch all jobs for all workflows in parallel
+        jobs_tasks = []
         for workflow in workflows:
-            self.logger.debug(f"Fetching jobs for workflow {workflow['id']}")
             url = f"{self.base_url}/workflow/{workflow['id']}/job"
-            response = requests.get(url, headers=self.headers)
-            response.raise_for_status()
-            jobs.extend(response.json()["items"])
+            jobs_tasks.append(self.get_json(session, url))
 
-        return jobs
+        jobs_responses = await asyncio.gather(*jobs_tasks, return_exceptions=True)
 
-    def get_artifacts(self, job_number: str) -> List[Dict]:
-        self.logger.debug(f"Fetching artifacts for job {job_number}")
-        url = f"{self.base_url}/project/{self.project_slug}/{job_number}/artifacts"
-        response = requests.get(url, headers=self.headers)
-        response.raise_for_status()
-        return response.json()["items"]
+        all_jobs = []
+        for jobs_data in jobs_responses:
+            if isinstance(jobs_data, Exception):
+                continue
+            all_jobs.extend(jobs_data["items"])
+
+        return all_jobs
 
-    def download_artifact(self, artifact_url: str) -> Dict:
-        self.logger.debug(f"Downloading artifact from {artifact_url}")
-        response = requests.get(artifact_url, headers=self.headers)
-        response.raise_for_status()
-        return response.json()
+    async def get_artifacts(self, session: aiohttp.ClientSession, job_number: str) -> List[Dict]:
+        url = f"{self.base_url}/project/{self.project_slug}/{job_number}/artifacts"
+        data = await self.get_json(session, url)
+        return data["items"]
 
 class PackageSizeTracker:
     def __init__(self, token: str, project_slug: str, debug: bool = False):
         self.setup_logging(debug)
-        self.client = CircleCIClient(token, project_slug)
+        self.client = AsyncCircleCIClient(token, project_slug)
         self.logger = logging.getLogger("PackageSizeTracker")

     def setup_logging(self, debug: bool):
@@ -89,80 +81,92 @@ class PackageSizeTracker:
             datefmt='%H:%M:%S'
         )
 
-    def extract_commit_info(self, pipeline: Dict) -> Optional[str]:
-        """Extract commit hash from pipeline data structure"""
+    def extract_commit_info(self, pipeline: Dict) -> Optional[Dict]:
         try:
-            # Try to get commit hash from trigger parameters
             if 'trigger_parameters' in pipeline:
                 github_app = pipeline['trigger_parameters'].get('github_app', {})
                 if github_app:
-                    return github_app.get('commit_sha')
+                    return {
+                        'commit_hash': github_app.get('checkout_sha'),
+                        'web_url': f"{github_app.get('repo_url')}/commit/{github_app.get('checkout_sha')}"
+                    }
 
-                # Fallback to git parameters if github_app is not available
                 git_params = pipeline['trigger_parameters'].get('git', {})
                 if git_params:
-                    return git_params.get('checkout_sha')
+                    return {
+                        'commit_hash': git_params.get('checkout_sha'),
+                        'web_url': f"{git_params.get('repo_url')}/commit/{git_params.get('checkout_sha')}"
+                    }
 
-            self.logger.warning(f"Could not find commit hash in pipeline {pipeline['id']}")
+            self.logger.warning(f"Could not find commit info in pipeline {pipeline['id']}")
             return None

         except Exception as e:
             self.logger.error(f"Error extracting commit info: {str(e)}")
             return None
 
-    def collect_data(self) -> List[Dict]:
-        self.logger.info("Starting data collection...")
-        pipelines = self.client.get_recent_pipelines(25)
-
-        data_points = []
-        for pipeline in pipelines:
-            try:
-                self.logger.debug(f"Processing pipeline {pipeline['id']}")
+    async def process_pipeline(self, session: aiohttp.ClientSession, pipeline: Dict) -> Optional[Dict]:
+        try:
+            commit_info = self.extract_commit_info(pipeline)
+            if not commit_info:
+                return None
+
+            jobs = await self.client.get_workflow_jobs(session, pipeline["id"])
+            size_job = next(
+                (j for j in jobs if j["name"] == "measure_pip_sizes" and j["status"] == "success"),
+                None
+            )
+
+            if not size_job:
+                self.logger.debug(f"No measure_pip_sizes job found for pipeline {pipeline['id']}")
+                return None
+
+            artifacts = await self.client.get_artifacts(session, size_job["job_number"])
+            size_report = next(
+                (a for a in artifacts if a["path"].endswith("pip-sizes.json")),
+                None
+            )
+
+            if not size_report:
+                self.logger.debug(f"No pip-sizes.json artifact found for job {size_job['job_number']}")
+                return None
+
+            json_data = await self.client.get_json(session, size_report["url"])
+            data_point = {
+                "commit_hash": commit_info['commit_hash'],
+                "commit_url": commit_info['web_url'],
+                "timestamp": pipeline.get("created_at", pipeline.get("updated_at")),
+                "total_size_mb": json_data["total_size_mb"],
+                "packages": json_data["packages"]
+            }
+
+            self.logger.info(
+                f"Processed pipeline {pipeline['id']}: "
+                f"commit {commit_info['commit_hash'][:7]}, "
+                f"size {json_data['total_size_mb']:.2f}MB"
+            )
+            return data_point
 
-                # Extract commit hash
-                commit_hash = self.extract_commit_info(pipeline)
-                if not commit_hash:
-                    continue
+        except Exception as e:
+            self.logger.error(f"Error processing pipeline {pipeline['id']}: {str(e)}")
+            return None
 
-                jobs = self.client.get_workflow_jobs(pipeline["id"])
+    async def collect_data(self) -> List[Dict]:
+        self.logger.info("Starting data collection...")
+        async with aiohttp.ClientSession(headers=self.client.headers) as session:
+            # Get pipelines
+            pipelines = await self.client.get_recent_pipelines(session, 50)
 
-                size_job = next(
-                    (j for j in jobs if j["name"] == "measure_pip_sizes" and j["status"] == "success"),
-                    None
-                )
+            # Process all pipelines in parallel
+            tasks = [self.process_pipeline(session, pipeline) for pipeline in pipelines]
+            results = await asyncio.gather(*tasks)
 
-                if size_job:
-                    artifacts = self.client.get_artifacts(size_job["job_number"])
-                    size_report = next(
-                        (a for a in artifacts if a["path"].endswith("pip-sizes.json")),
-                        None
-                    )
-
-                    if size_report:
-                        json_data = self.client.download_artifact(size_report["url"])
-                        data_point = {
-                            "commit_hash": commit_hash,
-                            "timestamp": pipeline.get("created_at", pipeline.get("updated_at")),
-                            "total_size_mb": json_data["total_size_mb"],
-                            "packages": json_data["packages"]
-                        }
-                        data_points.append(data_point)
-                        self.logger.info(
-                            f"Processed pipeline {pipeline['id']}: "
-                            f"commit {commit_hash[:7]}, "
-                            f"size {json_data['total_size_mb']:.2f}MB"
-                        )
-                    else:
-                        self.logger.debug(f"No pip-sizes.json artifact found for job {size_job['job_number']}")
-                else:
-                    self.logger.debug(f"No measure_pip_sizes job found for pipeline {pipeline['id']}")
-            except Exception as e:
-                self.logger.error(f"Error processing pipeline {pipeline['id']}: {str(e)}")
-                continue
+            # Filter out None results
+            data_points = [r for r in results if r is not None]
 
         return data_points
 
-    def generate_report(self, data: List[Dict], output_dir: str = "reports"):
+    def generate_report(self, data: List[Dict], output_dir: str = "reports") -> Optional[str]:
         self.logger.info("Generating report...")
         if not data:
             self.logger.error("No data to generate report from!")
@@ -171,18 +175,49 @@ class PackageSizeTracker:
         df = pd.DataFrame(data)
         df['timestamp'] = pd.to_datetime(df['timestamp'])
         df = df.sort_values('timestamp')
+        # commit_url is already in the data from process_pipeline
 
-        # Create trend plot
+        # Create trend plot with updated styling
         fig = px.line(
             df,
             x='timestamp',
             y='total_size_mb',
-            title='Package Size Trend'
+            title='Package Size Trend',
+            markers=True,
+            hover_data={'commit_hash': True, 'timestamp': True, 'total_size_mb': ':.2f'},
+            custom_data=['commit_hash', 'commit_url']
         )
         fig.update_layout(
             xaxis_title="Date",
             yaxis_title="Total Size (MB)",
-            hovermode='x unified'
+            hovermode='x unified',
+            plot_bgcolor='white',
+            paper_bgcolor='white',
+            font=dict(size=12),
+            title_x=0.5,
+        )
+        fig.update_traces(
+            line=dict(width=2),
+            marker=dict(size=8),
+            hovertemplate="<br>".join([
+                "Commit: %{customdata[0]}",
+                "Size: %{y:.2f}MB",
+                "Date: %{x}",
+                "<extra>Click to view commit</extra>"
+            ])
+        )
+
+        # Add JavaScript for click handling
+        fig.update_layout(
+            clickmode='event',
+            annotations=[
+                dict(
+                    text="Click any point to view the commit on GitHub",
+                    xref="paper", yref="paper",
+                    x=0, y=1.05,
+                    showarrow=False
+                )
+            ]
         )

         # Ensure output directory exists
@@ -191,14 +226,25 @@ class PackageSizeTracker:
 
         # Save plot
         plot_path = output_dir / "package_size_trend.html"
-        fig.write_html(str(plot_path))
+        fig.write_html(
+            str(plot_path),
+            include_plotlyjs=True,
+            full_html=True,
+            post_script="""
+            const plot = document.getElementsByClassName('plotly-graph-div')[0];
+            plot.on('plotly_click', function(data) {
+                const point = data.points[0];
+                const commitUrl = point.customdata[1];
+                window.open(commitUrl, '_blank');
+            });
+            """
+        )
 
         # Generate summary
         latest = df.iloc[-1]
         previous = df.iloc[-2] if len(df) > 1 else latest
         size_change = latest['total_size_mb'] - previous['total_size_mb']
 
-        # Save latest data
         latest_data = {
             'timestamp': latest['timestamp'].isoformat(),
             'commit_hash': latest['commit_hash'],
@@ -210,9 +256,7 @@ class PackageSizeTracker:
         with open(output_dir / 'latest_data.json', 'w') as f:
             json.dump(latest_data, f, indent=2)
 
-        # Print summary to console
         self._print_summary(latest_data)
-
         self.logger.info(f"Report generated in {output_dir}")
         return str(plot_path)
 
@@ -232,8 +276,7 @@ class PackageSizeTracker:
             print(f"- {pkg['name']}: {pkg['size_mb']:.2f}MB")
         print("\n")
 
-def main():
-    # Get configuration
+async def main():
     token = os.getenv("CIRCLECI_TOKEN")
     project_slug = os.getenv("CIRCLECI_PROJECT_SLUG")
     debug = os.getenv("DEBUG", "").lower() in ("true", "1", "yes")
@@ -242,17 +285,14 @@ def main():
         print("Error: Please set CIRCLECI_TOKEN and CIRCLECI_PROJECT_SLUG environment variables")
         return
 
-    # Initialize tracker
     tracker = PackageSizeTracker(token, project_slug, debug)

     try:
-        # Collect data
-        data = tracker.collect_data()
+        data = await tracker.collect_data()
         if not data:
             print("No data found!")
             return
 
-        # Generate report
         report_path = tracker.generate_report(data)
         if report_path:
             print(f"\nDetailed report available at: {report_path}")
@@ -263,4 +303,4 @@ def main():
             raise

 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
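
For reference, a minimal usage sketch of the updated async flow, not part of this commit. It assumes the script is importable as a module named dashboard (it lives at extra/dashboard/dashboard.py, so run from that directory or put it on PYTHONPATH) and that CIRCLECI_TOKEN and CIRCLECI_PROJECT_SLUG are set, the same environment variables main() reads; everything else comes from the code above.

# Usage sketch only; the module name "dashboard" and the demo() wrapper are illustrative.
import asyncio
import os

from dashboard import PackageSizeTracker


async def demo() -> None:
    tracker = PackageSizeTracker(
        token=os.environ["CIRCLECI_TOKEN"],
        project_slug=os.environ["CIRCLECI_PROJECT_SLUG"],
        debug=True,
    )
    # collect_data() now fans out over pipelines, jobs and artifacts with asyncio.gather
    data = await tracker.collect_data()
    if data:
        report_path = tracker.generate_report(data, output_dir="reports")
        print(f"Report written to {report_path}")


if __name__ == "__main__":
    asyncio.run(demo())

Opening the generated package_size_trend.html and clicking a data point should open the corresponding commit page in a new tab via the plotly_click handler injected through post_script.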