From 6d9059426ca413e17025a2ed36f93252e63fc57a Mon Sep 17 00:00:00 2001 From: Justin Date: Fri, 27 Feb 2026 01:13:03 -0600 Subject: [PATCH 1/2] Initial coursebook scraper retry setup --- main.go | 18 +++++++--- scrapers/coursebook.go | 78 +++++++++++++++++++++++++++++------------- utils/methods.go | 20 +++++------ utils/utils_test.go | 7 ++-- 4 files changed, 82 insertions(+), 41 deletions(-) diff --git a/main.go b/main.go index 711d033..ab62c4e 100644 --- a/main.go +++ b/main.go @@ -1,6 +1,7 @@ package main import ( + "errors" "flag" "fmt" "log" @@ -16,7 +17,7 @@ import ( func main() { // Load environment variables - godotenv.Load() + godotenv.Load() // TODO: I Don't think this does anything // Setup flags @@ -85,8 +86,8 @@ func main() { } defer logFile.Close() - // Set logging output destination to a SplitWriter that writes to both the log file and stdout - log.SetOutput(utils.NewSplitWriter(logFile, os.Stdout)) + // Set logging output destination to a SplitWriter that writes to both the log file and stderr + log.SetOutput(utils.NewSplitWriter(logFile, os.Stdout)) // TODO: Switch to stderr // Do verbose logging if verbose flag specified if *verbose { log.SetFlags(log.Ltime | log.Lmicroseconds | log.Lshortfile | utils.Lverbose) @@ -103,9 +104,16 @@ func main() { scrapers.ScrapeProfiles(*outDir) case *scrapeCoursebook: if *term == "" { - log.Panic("No term specified for coursebook scraping! Use -term to specify.") + log.Fatalf("No term specified for coursebook scraping! 
Use -term to specify.") } - scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume) + // TODO Handle Errors, give flag to retry on panic, and a retry count for non panics, how long to wait for retry + err := scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume) + if errors.As(err, new(*scrapers.SetupError)) { // match by type: errors.Is against a freshly allocated *SetupError compares pointer identity and never succeeds + log.Fatalf("Coursebook Scraping Setup Failed: %v", err) + } else if err != nil { + log.Fatalf("Coursebook Scraping Failed: %v", err) + } + case *scrapeDiscounts: scrapers.ScrapeDiscounts(*outDir) case *cometCalendar: diff --git a/scrapers/coursebook.go b/scrapers/coursebook.go index 4f6119c..76deb15 100644 --- a/scrapers/coursebook.go +++ b/scrapers/coursebook.go @@ -35,21 +35,36 @@ const ( httpTimeout = 10 * time.Second ) +type SetupError struct { + Message string +} + +func (e *SetupError) Error() string { + return fmt.Sprintf("%s", e.Message) +} + // ScrapeCoursebook scrapes utd coursebook for the provided term (semester) -func ScrapeCoursebook(term string, startPrefix string, outDir string, resume bool) { +func ScrapeCoursebook(term string, startPrefix string, outDir string, resume bool) error { if startPrefix != "" && !prefixRegex.MatchString(startPrefix) { - log.Fatalf("Invalid starting prefix %s, must match format cp_{abcde}", startPrefix) + return &SetupError{Message: fmt.Sprintf("invalid starting prefix %s, must match format cp_{abcde}", startPrefix)} } if !termRegex.MatchString(term) { - log.Fatalf("Invalid term %s, must match format {00-99}{s/f/u}", term) + return &SetupError{Message: fmt.Sprintf("invalid term %s, must match format {00-99}{s/f/u}", term)} } - scraper := newCoursebookScraper(term, outDir) + scraper, err := newCoursebookScraper(term, outDir) + if err != nil { + return err + } defer scraper.chromedpCancel() if resume && startPrefix == "" { // providing a starting prefix overrides the resume flag - startPrefix = scraper.lastCompletePrefix() + var err error + startPrefix, err = scraper.lastCompletePrefix() + if err != 
nil { + return &SetupError{Message: fmt.Sprintf("failed to get last complete prefix while resuming: %v", err)} + } } log.Printf("[Begin Scrape] Starting scrape for term %s with %d prefixes", term, len(scraper.prefixes)) @@ -62,7 +77,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo start := time.Now() if err := scraper.ensurePrefixFolder(prefix); err != nil { - log.Fatal(err) + log.Panic(err) } var sectionIds []string @@ -76,7 +91,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo } if err != nil { - log.Fatalf("Error getting section ids for %s ", prefix) + log.Panicf("Error getting section ids for %s ", prefix) } if len(sectionIds) == 0 { @@ -89,10 +104,10 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo for _, sectionId := range sectionIds { content, err := scraper.getSectionContent(sectionId) if err != nil { - log.Fatalf("Error getting section content for section %s: %v", sectionId, err) + log.Panicf("Error getting section content for section %s: %v", sectionId, err) } if err := scraper.writeSection(prefix, sectionId, content); err != nil { - log.Fatalf("Error writing section %s: %v", sectionId, err) + log.Panicf("Error writing section %s: %v", sectionId, err) } time.Sleep(reqThrottle) } @@ -104,8 +119,10 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo log.Printf("[Scrape Complete] Finished scraping term %s in %v. 
Total sections %d: Total retries %d", term, time.Since(totalTime), scraper.totalScrapedSections, scraper.reqRetries) if err := scraper.validate(); err != nil { - log.Fatal("Validating failed: ", err) + log.Panicf("Validating failed: %v", err) } + + return nil } type coursebookScraper struct { @@ -124,38 +141,45 @@ type coursebookScraper struct { totalScrapedSections int } -func newCoursebookScraper(term string, outDir string) *coursebookScraper { +func newCoursebookScraper(term string, outDir string) (*coursebookScraper, error) { ctx, cancel := utils.InitChromeDp() httpClient := &http.Client{ Timeout: httpTimeout, } //prefixes in alphabetical order for skip prefix flag - prefixes := utils.GetCoursePrefixes(ctx) + prefixes, err := utils.GetCoursePrefixes(ctx) + if err != nil { + return nil, err + } sort.Strings(prefixes) + coursebookHeaders, err := utils.RefreshToken(ctx) + if err != nil { + return nil, err + } return &coursebookScraper{ chromedpCtx: ctx, chromedpCancel: cancel, httpClient: httpClient, prefixes: prefixes, - coursebookHeaders: utils.RefreshToken(ctx), + coursebookHeaders: coursebookHeaders, term: term, outDir: outDir, prefixIdsCache: make(map[string][]string), - } + }, nil } // lastCompletePrefix returns the last prefix (alphabetical order) that contains // html files for all of its section ids. 
returns an empty string if there are no // complete prefixes -func (s *coursebookScraper) lastCompletePrefix() string { +func (s *coursebookScraper) lastCompletePrefix() (string, error) { if err := s.ensureOutputFolder(); err != nil { - log.Fatal(err) + return "", err } dir, err := os.ReadDir(filepath.Join(s.outDir, s.term)) if err != nil { - log.Fatalf("failed to read output directory: %v", err) + return "", fmt.Errorf("failed to read output directory: %w", err) } foundPrefixes := make([]string, 0, len(s.prefixes)) @@ -169,14 +193,14 @@ func (s *coursebookScraper) lastCompletePrefix() string { for _, prefix := range foundPrefixes { missing, err := s.getMissingIdsForPrefix(prefix) if err != nil { - log.Fatalf("Failed to get ids: %v", err) + return "", fmt.Errorf("failed to get ids: %w", err) } if len(missing) == 0 { - return prefix + return prefix, nil } time.Sleep(reqThrottle) } - return "" + return "", nil } // ensurePrefixFolder creates {outDir}/term if it does not exist @@ -235,7 +259,7 @@ func (s *coursebookScraper) getMissingIdsForPrefix(prefix string) ([]string, err dir, err := os.ReadDir(path) if err != nil { - log.Panicf("Failed to access folder %s: %v", path, err) + return sectionIds, fmt.Errorf("failed to access folder %s: %w", path, err) } foundIds := make(map[string]bool) @@ -285,7 +309,7 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s err := utils.Retry(func() error { req, err := http.NewRequest("POST", "https://coursebook.utdallas.edu/clips/clip-cb11-hat.zog", strings.NewReader(queryStr)) if err != nil { - log.Fatalf("Http request failed: %v", err) + return fmt.Errorf("http request failed: %w", err) } req.Header = s.coursebookHeaders @@ -310,7 +334,13 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s return err }, retries, func(numRetries int) { utils.VPrintf("[Request Retry] Attempt %d of %d for request %s", numRetries, retries, reqName) - s.coursebookHeaders = 
utils.RefreshToken(s.chromedpCtx) + coursebookHeaders, err := utils.RefreshToken(s.chromedpCtx) + if err != nil { + // TODO: Since this is in a retry, perhaps we should implement this differently + utils.VPrintf("[Token Refresh Failed] Failed to refresh token during retry for request %s: %v", reqName, err) + } + s.coursebookHeaders = coursebookHeaders + s.reqRetries++ //back off exponentially @@ -345,7 +375,7 @@ func (s *coursebookScraper) validate() error { log.Printf("[Validation] Missing %d sections for %s", len(ids), prefix) if err := s.ensurePrefixFolder(prefix); err != nil { - log.Fatal(err) + log.Panic(err) } for _, id := range ids { diff --git a/utils/methods.go b/utils/methods.go index 90712e2..1afc710 100644 --- a/utils/methods.go +++ b/utils/methods.go @@ -54,14 +54,14 @@ func InitChromeDp() (chromedpCtx context.Context, cancelFnc context.CancelFunc) } // RefreshToken logs into CourseBook and returns headers containing a fresh session token. -func RefreshToken(chromedpCtx context.Context) map[string][]string { +func RefreshToken(chromedpCtx context.Context) (map[string][]string, error) { netID, err := GetEnv("LOGIN_NETID") if err != nil { - panic(err) + return nil, err } password, err := GetEnv("LOGIN_PASSWORD") if err != nil { - panic(err) + return nil, err } delayedRetryCallback := func(numRetries int) { @@ -81,13 +81,13 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string { chromedp.Click(`button#login-button`), ) if r != nil && r.Status != 200 { - return errors.New("Non-200 response status code") + return fmt.Errorf("non-200 response status code: %d", r.Status) } return err }, 3, delayedRetryCallback) if err != nil { - panic(err) + return nil, err // TODO: we should return different error or error types based on the response code } time.Sleep(250 * time.Millisecond) @@ -124,7 +124,7 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string { }, 3, delayedRetryCallback) if err != nil { - panic(err) + return nil, err } 
return map[string][]string{ @@ -135,7 +135,7 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string { "Content-Type": {"application/x-www-form-urlencoded"}, "Cookie": cookieStrs, "Connection": {"keep-alive"}, - } + }, nil } // RefreshAstraToken signs into Astra and returns headers containing authentication cookies. @@ -288,7 +288,7 @@ func Retry(action func() error, maxRetries int, retryCallback func(numRetries in } // GetCoursePrefixes retrieves all course prefix values from CourseBook. -func GetCoursePrefixes(chromedpCtx context.Context) []string { +func GetCoursePrefixes(chromedpCtx context.Context) ([]string, error) { // Might need to refresh the token every time we get new course prefixes in the future // refreshToken(chromedpCtx) @@ -308,10 +308,10 @@ func GetCoursePrefixes(chromedpCtx context.Context) []string { ), ) if err != nil { - log.Panic(err) + return nil, err } log.Printf("Found the %d course prefixes!", len(coursePrefixes)) - return coursePrefixes + return coursePrefixes, nil } // ConvertFromInterface attempts to convert a value into the requested type and returns a pointer when successful. 
diff --git a/utils/utils_test.go b/utils/utils_test.go index aeb5ba2..a93c48b 100644 --- a/utils/utils_test.go +++ b/utils/utils_test.go @@ -40,7 +40,10 @@ func TestRefreshToken(t *testing.T) { ctx, cancel := InitChromeDp() defer cancel() // Try refreshing token - headers := RefreshToken(ctx) + headers, err := RefreshToken(ctx) + if err != nil { + t.Errorf("Failed to refresh token: %v", err) + } // Make sure we successfully got a PTGSESSID cookie for _, cookie := range headers["Cookie"] { if strings.HasPrefix(cookie, "PTGSESSID") { @@ -48,5 +51,5 @@ func TestRefreshToken(t *testing.T) { } } // Fail if no PTGSESSID cookie found - t.Fatalf("Failed to get PTGSESSID cookie from RefreshToken!") + t.Errorf("Failed to get PTGSESSID cookie from RefreshToken!") } From 58d66ca6f21482040d2a651b7a61da110423ac4e Mon Sep 17 00:00:00 2001 From: Justin Date: Fri, 27 Feb 2026 01:25:30 -0600 Subject: [PATCH 2/2] staticcheck fix - arg is already a string --- scrapers/coursebook.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapers/coursebook.go b/scrapers/coursebook.go index 76deb15..a3b62ce 100644 --- a/scrapers/coursebook.go +++ b/scrapers/coursebook.go @@ -40,7 +40,7 @@ type SetupError struct { } func (e *SetupError) Error() string { - return fmt.Sprintf("%s", e.Message) + return e.Message } // ScrapeCoursebook scrapes utd coursebook for the provided term (semester) @@ -337,10 +337,10 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s coursebookHeaders, err := utils.RefreshToken(s.chromedpCtx) if err != nil { // TODO: Since this is in a retry, perhaps we should implement this differently - utils.VPrintf("[Token Refresh Failed] Failed to refresh token during retry for request %s: %v", reqName, err) + utils.VPrintf("[Token Refresh Failed] Failed to refresh token during retry for request %s: %v", reqName, err) } s.coursebookHeaders = coursebookHeaders - + s.reqRetries++ //back off exponentially