Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions main.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package main

import (
"errors"
"flag"
"fmt"
"log"
Expand All @@ -16,7 +17,7 @@ import (

func main() {
// Load environment variables
godotenv.Load()
godotenv.Load() // TODO: confirm this does anything — the returned error is discarded, so a missing .env fails silently

// Setup flags

Expand Down Expand Up @@ -85,8 +86,8 @@ func main() {
}

defer logFile.Close()
// Set logging output destination to a SplitWriter that writes to both the log file and stdout
log.SetOutput(utils.NewSplitWriter(logFile, os.Stdout))
// Set logging output destination to a SplitWriter that writes to both the log file and stdout
log.SetOutput(utils.NewSplitWriter(logFile, os.Stdout)) // TODO: Switch to stderr
// Do verbose logging if verbose flag specified
if *verbose {
log.SetFlags(log.Ltime | log.Lmicroseconds | log.Lshortfile | utils.Lverbose)
Expand All @@ -103,9 +104,16 @@ func main() {
scrapers.ScrapeProfiles(*outDir)
case *scrapeCoursebook:
if *term == "" {
log.Panic("No term specified for coursebook scraping! Use -term to specify.")
log.Fatalf("No term specified for coursebook scraping! Use -term to specify.")
}
scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume)
// TODO: Handle errors — add a flag to retry on panic, a retry count for non-panic failures, and a configurable retry delay
err := scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume)
if errors.Is(err, &scrapers.SetupError{}) {
log.Fatalf("Coursebook Scraping Setup Failed: %v", err)
} else if err != nil {
log.Fatalf("Coursebook Scraping Failed: %v", err)
}

case *scrapeDiscounts:
scrapers.ScrapeDiscounts(*outDir)
case *cometCalendar:
Expand Down
78 changes: 54 additions & 24 deletions scrapers/coursebook.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,21 +35,36 @@ const (
httpTimeout = 10 * time.Second
)

// SetupError indicates that coursebook scraping could not begin — e.g. invalid
// flags or failed scraper initialization — as opposed to an error that occurs
// mid-scrape.
type SetupError struct {
	Message string
}

// Error returns the setup failure message, satisfying the error interface.
func (e *SetupError) Error() string {
	return e.Message
}

// Is reports whether target is a *SetupError, regardless of its contents.
// This makes errors.Is(err, &SetupError{}) work as a type check (as the
// caller in main.go uses it); without this method errors.Is falls back to
// pointer equality and would never match a freshly constructed target.
func (e *SetupError) Is(target error) bool {
	_, ok := target.(*SetupError)
	return ok
}

// ScrapeCoursebook scrapes utd coursebook for the provided term (semester)
func ScrapeCoursebook(term string, startPrefix string, outDir string, resume bool) {
func ScrapeCoursebook(term string, startPrefix string, outDir string, resume bool) error {
if startPrefix != "" && !prefixRegex.MatchString(startPrefix) {
log.Fatalf("Invalid starting prefix %s, must match format cp_{abcde}", startPrefix)
return &SetupError{Message: fmt.Sprintf("invalid starting prefix %s, must match format cp_{abcde}", startPrefix)}
}
if !termRegex.MatchString(term) {
log.Fatalf("Invalid term %s, must match format {00-99}{s/f/u}", term)
return &SetupError{Message: fmt.Sprintf("invalid term %s, must match format {00-99}{s/f/u}", term)}
}

scraper := newCoursebookScraper(term, outDir)
scraper, err := newCoursebookScraper(term, outDir)
if err != nil {
return err
}
defer scraper.chromedpCancel()

if resume && startPrefix == "" {
// providing a starting prefix overrides the resume flag
startPrefix = scraper.lastCompletePrefix()
var err error
startPrefix, err = scraper.lastCompletePrefix()
if err != nil {
return &SetupError{Message: fmt.Sprintf("failed to get last complete prefix while resuming: %v", err)}
}
}

log.Printf("[Begin Scrape] Starting scrape for term %s with %d prefixes", term, len(scraper.prefixes))
Expand All @@ -62,7 +77,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo

start := time.Now()
if err := scraper.ensurePrefixFolder(prefix); err != nil {
log.Fatal(err)
log.Panic(err)
}

var sectionIds []string
Expand All @@ -76,7 +91,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo
}

if err != nil {
log.Fatalf("Error getting section ids for %s ", prefix)
log.Panicf("Error getting section ids for %s ", prefix)
}

if len(sectionIds) == 0 {
Expand All @@ -89,10 +104,10 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo
for _, sectionId := range sectionIds {
content, err := scraper.getSectionContent(sectionId)
if err != nil {
log.Fatalf("Error getting section content for section %s: %v", sectionId, err)
log.Panicf("Error getting section content for section %s: %v", sectionId, err)
}
if err := scraper.writeSection(prefix, sectionId, content); err != nil {
log.Fatalf("Error writing section %s: %v", sectionId, err)
log.Panicf("Error writing section %s: %v", sectionId, err)
}
time.Sleep(reqThrottle)
}
Expand All @@ -104,8 +119,10 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo
log.Printf("[Scrape Complete] Finished scraping term %s in %v. Total sections %d: Total retries %d", term, time.Since(totalTime), scraper.totalScrapedSections, scraper.reqRetries)

if err := scraper.validate(); err != nil {
log.Fatal("Validating failed: ", err)
log.Panicf("Validating failed: %v", err)
}

return nil
}

type coursebookScraper struct {
Expand All @@ -124,38 +141,45 @@ type coursebookScraper struct {
totalScrapedSections int
}

func newCoursebookScraper(term string, outDir string) *coursebookScraper {
func newCoursebookScraper(term string, outDir string) (*coursebookScraper, error) {
ctx, cancel := utils.InitChromeDp()
httpClient := &http.Client{
Timeout: httpTimeout,
}

//prefixes in alphabetical order for skip prefix flag
prefixes := utils.GetCoursePrefixes(ctx)
prefixes, err := utils.GetCoursePrefixes(ctx)
if err != nil {
return nil, err
}
sort.Strings(prefixes)
coursebookHeaders, err := utils.RefreshToken(ctx)
if err != nil {
return nil, err
}
return &coursebookScraper{
chromedpCtx: ctx,
chromedpCancel: cancel,
httpClient: httpClient,
prefixes: prefixes,
coursebookHeaders: utils.RefreshToken(ctx),
coursebookHeaders: coursebookHeaders,
term: term,
outDir: outDir,
prefixIdsCache: make(map[string][]string),
}
}, nil
}

// lastCompletePrefix returns the last prefix (alphabetical order) that contains
// html files for all of its section ids. returns an empty string if there are no
// complete prefixes
func (s *coursebookScraper) lastCompletePrefix() string {
func (s *coursebookScraper) lastCompletePrefix() (string, error) {
if err := s.ensureOutputFolder(); err != nil {
log.Fatal(err)
return "", err
}

dir, err := os.ReadDir(filepath.Join(s.outDir, s.term))
if err != nil {
log.Fatalf("failed to read output directory: %v", err)
return "", fmt.Errorf("failed to read output directory: %w", err)
}

foundPrefixes := make([]string, 0, len(s.prefixes))
Expand All @@ -169,14 +193,14 @@ func (s *coursebookScraper) lastCompletePrefix() string {
for _, prefix := range foundPrefixes {
missing, err := s.getMissingIdsForPrefix(prefix)
if err != nil {
log.Fatalf("Failed to get ids: %v", err)
return "", fmt.Errorf("failed to get ids: %w", err)
}
if len(missing) == 0 {
return prefix
return prefix, nil
}
time.Sleep(reqThrottle)
}
return ""
return "", nil
}

// ensurePrefixFolder creates {outDir}/term if it does not exist
Expand Down Expand Up @@ -235,7 +259,7 @@ func (s *coursebookScraper) getMissingIdsForPrefix(prefix string) ([]string, err

dir, err := os.ReadDir(path)
if err != nil {
log.Panicf("Failed to access folder %s: %v", path, err)
return sectionIds, fmt.Errorf("failed to access folder %s: %w", path, err)
}

foundIds := make(map[string]bool)
Expand Down Expand Up @@ -285,7 +309,7 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s
err := utils.Retry(func() error {
req, err := http.NewRequest("POST", "https://coursebook.utdallas.edu/clips/clip-cb11-hat.zog", strings.NewReader(queryStr))
if err != nil {
log.Fatalf("Http request failed: %v", err)
return fmt.Errorf("http request failed: %w", err)
}
req.Header = s.coursebookHeaders

Expand All @@ -310,7 +334,13 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s
return err
}, retries, func(numRetries int) {
utils.VPrintf("[Request Retry] Attempt %d of %d for request %s", numRetries, retries, reqName)
s.coursebookHeaders = utils.RefreshToken(s.chromedpCtx)
coursebookHeaders, err := utils.RefreshToken(s.chromedpCtx)
if err != nil {
// TODO: this runs in a retry callback; on refresh failure we still overwrite s.coursebookHeaders with nil below (RefreshToken returns nil headers on error) — keep the previous headers instead
utils.VPrintf("[Token Refresh Failed] Failed to refresh token during retry for request %s: %v", reqName, err)
}
s.coursebookHeaders = coursebookHeaders

s.reqRetries++

//back off exponentially
Expand Down Expand Up @@ -345,7 +375,7 @@ func (s *coursebookScraper) validate() error {
log.Printf("[Validation] Missing %d sections for %s", len(ids), prefix)

if err := s.ensurePrefixFolder(prefix); err != nil {
log.Fatal(err)
log.Panic(err)
}

for _, id := range ids {
Expand Down
20 changes: 10 additions & 10 deletions utils/methods.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,14 @@ func InitChromeDp() (chromedpCtx context.Context, cancelFnc context.CancelFunc)
}

// RefreshToken logs into CourseBook and returns headers containing a fresh session token.
func RefreshToken(chromedpCtx context.Context) map[string][]string {
func RefreshToken(chromedpCtx context.Context) (map[string][]string, error) {
netID, err := GetEnv("LOGIN_NETID")
if err != nil {
panic(err)
return nil, err
}
password, err := GetEnv("LOGIN_PASSWORD")
if err != nil {
panic(err)
return nil, err
}

delayedRetryCallback := func(numRetries int) {
Expand All @@ -81,13 +81,13 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string {
chromedp.Click(`button#login-button`),
)
if r != nil && r.Status != 200 {
return errors.New("Non-200 response status code")
return fmt.Errorf("non-200 response status code: %d", r.Status)
}
return err
}, 3, delayedRetryCallback)

if err != nil {
panic(err)
return nil, err // TODO: we should return different error or error types based on the response code
}

time.Sleep(250 * time.Millisecond)
Expand Down Expand Up @@ -124,7 +124,7 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string {
}, 3, delayedRetryCallback)

if err != nil {
panic(err)
return nil, err
}

return map[string][]string{
Expand All @@ -135,7 +135,7 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string {
"Content-Type": {"application/x-www-form-urlencoded"},
"Cookie": cookieStrs,
"Connection": {"keep-alive"},
}
}, nil
}

// RefreshAstraToken signs into Astra and returns headers containing authentication cookies.
Expand Down Expand Up @@ -288,7 +288,7 @@ func Retry(action func() error, maxRetries int, retryCallback func(numRetries in
}

// GetCoursePrefixes retrieves all course prefix values from CourseBook.
func GetCoursePrefixes(chromedpCtx context.Context) []string {
func GetCoursePrefixes(chromedpCtx context.Context) ([]string, error) {
// Might need to refresh the token every time we get new course prefixes in the future
// refreshToken(chromedpCtx)

Expand All @@ -308,10 +308,10 @@ func GetCoursePrefixes(chromedpCtx context.Context) []string {
),
)
if err != nil {
log.Panic(err)
return nil, err
}
log.Printf("Found the %d course prefixes!", len(coursePrefixes))
return coursePrefixes
return coursePrefixes, nil
}

// ConvertFromInterface attempts to convert a value into the requested type and returns a pointer when successful.
Expand Down
7 changes: 5 additions & 2 deletions utils/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,16 @@ func TestRefreshToken(t *testing.T) {
ctx, cancel := InitChromeDp()
defer cancel()
// Try refreshing token
headers := RefreshToken(ctx)
headers, err := RefreshToken(ctx)
if err != nil {
t.Errorf("Failed to refresh token: %v", err)
}
// Make sure we successfully got a PTGSESSID cookie
for _, cookie := range headers["Cookie"] {
if strings.HasPrefix(cookie, "PTGSESSID") {
return
}
}
// Fail if no PTGSESSID cookie found
t.Fatalf("Failed to get PTGSESSID cookie from RefreshToken!")
t.Errorf("Failed to get PTGSESSID cookie from RefreshToken!")
}