Fix callbacks set after the Visit function
kinoute committed Sep 15, 2024
1 parent 2a177de commit 48ac802
Showing 10 changed files with 114 additions and 148 deletions.
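The underlying issue: colly only runs callbacks that were registered on the collector before the request is made. With a synchronous collector, Visit performs the request immediately, so an OnHTML or OnError handler attached afterwards never fires for that visit. A minimal sketch of the corrected ordering, assuming the gocolly/colly v2 API; the URL and selector are placeholders, not taken from the repository:

package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector()

	// Register callbacks first; with the old ordering these were
	// attached after Visit and never ran for the index page.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		fmt.Println("found link:", e.Attr("href"))
	})
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("request failed:", r.Request.URL, err)
	})

	// Only then start crawling.
	if err := c.Visit("https://example.com"); err != nil {
		fmt.Println("can't visit index page:", err)
	}
}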
142 changes: 69 additions & 73 deletions main.go
@@ -7,6 +7,7 @@ import (
"moviestills/websites"
"os"
"os/signal"
"path/filepath"
"reflect"
"strings"
"syscall"
@@ -43,139 +44,134 @@ func main() {
var options config.Options
arg.MustParse(&options)

// We override the default prefix label for "info" messages to
// align it perfectly on the Terminal with other labels. Otherwise,
// since "INFO" is shorter than the other labels, the width
// of the different labels is not the same.
log.Info = *log.Info.WithPrefix(log.Prefix{Text: " INFOS ", Style: log.Info.Prefix.Style})

// Disable style and colors for output
if options.NoStyle {
log.DisableStyling()
}

// Disable colors only for output
if options.NoColors {
log.DisableColor()
}

// Interface of the app
log.DefaultHeader.Println("Movie Stills", config.VERSION)

// Adjust logging styles
setupLogging(&options)

// Display available scrapers implemented
if options.ListScrapers {
listAvailableScrapers(sites)
return
}

log.DefaultSection.Println("Configuration")
printConfiguration(&options)

// Check presence of website argument
if options.Website == "" {
log.Error.Println("A website must be set through arguments.")
listAvailableScrapers(sites)
os.Exit(1)
}

website := strings.ToLower(options.Website)

// Verify if we have a scraper for the given website.
// If we do, "site_func" will now contain a function listed in
// the sites map that matches a module for this specific
// website stored in the "websites" folder.
siteFunc, scraperExists := sites[strings.ToLower(options.Website)]
if !scraperExists {
log.Error.Println("We don't have a scraper for this website.")
// List available scrapers
if _, exists := sites[website]; !exists {
log.Error.Println("We don't have a scraper for:", log.White(website))
listAvailableScrapers(sites)
os.Exit(1)
}

// If we're here, it means we have a valid scraper for a valid website!
log.DefaultSection.Println("Configuration")
printConfiguration(&options)

// Create the "cache" directory.
// This folder stores the scraped websites pages.
// If we can't create it, stop right there.
// Create the necessary directories (cache and data)
setupDirectories(&options)

// Create and configure scraper for each website
scraper := colly.NewCollector(
colly.CacheDir(filepath.Join(options.CacheDir, website)),
)

// Set up scraper settings
configureScraper(&scraper, &options)

// Run the scraper for the current website
siteFunc := sites[website]
siteFunc(&scraper, &options)

log.Info.Println("Finished scraping", log.White(website))

}

func clearScreen() {
print("\033[H\033[2J")
}

func handleShutdown() {
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM, syscall.SIGINT)
go func() {
<-sigChan
log.Info.Println("Shutting down...")
os.Exit(130)
}()
}

func setupLogging(options *config.Options) {
// Adjust the logging prefix
log.Info = *log.Info.WithPrefix(log.Prefix{Text: " INFOS ", Style: log.Info.Prefix.Style})

// Disable styles and colors based on options
if options.NoStyle {
log.DisableStyling()
}
if options.NoColors {
log.DisableColor()
}
}

func setupDirectories(options *config.Options) {
// Create the cache directory
if _, err := utils.CreateFolder(options.CacheDir); err != nil {
log.Error.Println("The cache directory", log.White(options.CacheDir), "can't be created:", log.Red(err))
os.Exit(1)
}

// Create the "data" directory.
// This folder stores the movie snapshots.
// If we can't create it, stop right there.
// Create the data directory
if _, err := utils.CreateFolder(options.DataDir); err != nil {
log.Error.Println("The data directory", log.White(options.DataDir), "can't be created:", log.Red(err))
os.Exit(1)
}
}

// Instantiate main scraper
scraper := colly.NewCollector(
colly.CacheDir(options.CacheDir),
)

func configureScraper(scraper **colly.Collector, options *config.Options) {
// Set up a proxy
if options.Proxy != "" {
if err := scraper.SetProxy(options.Proxy); err != nil {
if err := (*scraper).SetProxy(options.Proxy); err != nil {
log.Error.Println("Could not set proxy", log.White(options.Proxy), log.Red(err))
}
}

// Set request timeout
scraper.SetRequestTimeout(options.TimeOut)
(*scraper).SetRequestTimeout(options.TimeOut)

// Enable asynchronous jobs if asked
if options.Async {
scraper.Async = true
(*scraper).Async = true
}

// Enable Debugging level if asked through the CLI
if options.Debug {
log.EnableDebugMessages()
scraper.SetDebugger(&debug.PTermDebugger{})
(*scraper).SetDebugger(&debug.PTermDebugger{})
}

// Use random user agent and referer to avoid getting banned
extensions.RandomUserAgent(scraper)
extensions.Referer(scraper)
extensions.RandomUserAgent((*scraper))
extensions.Referer((*scraper))

// Limit parallelism and add random delay to avoid getting IP banned
if err := scraper.Limit(&colly.LimitRule{
if err := (*scraper).Limit(&colly.LimitRule{
DomainGlob: "*",
Parallelism: options.Parallel,
RandomDelay: 1 * options.RandomDelay,
}); err != nil {
log.Warning.Println("Can't change scraper limit options:", log.Red(err))
}

log.DefaultSection.Println("Scraping")
log.Info.Println("Starting", options.Website, "Scraper")

// Here we call the website module depending on the website provided
// in the CLI by the user.
// This will call a file/module/func made specifically to scrape this website.
// All available scrapers are stored in the "websites" folder.
siteFunc(&scraper, &options)

log.Info.Println("Finished Scraping", options.Website, "!")

}

// Clear Terminal Screen
func clearScreen() {
print("\033[H\033[2J")
}

// Signal handling function
func handleShutdown() {
sigChan := make(chan os.Signal, 1)

// Listen for SIGINT (Ctrl+C) and SIGTERM (termination)
signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM, syscall.SIGINT)

go func() {
<-sigChan
log.Info.Println("Shutting down...")
os.Exit(130)
}()
}

// Print configuration as a bullet list. Most
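One behavioral detail in the rewritten main.go: the colly cache is now scoped per website via filepath.Join(options.CacheDir, website), which is why "path/filepath" joins the imports. A sketch with assumed stand-in values; the real ones come from config.Options and the CLI argument:

package main

import (
	"path/filepath"

	"github.com/gocolly/colly/v2"
)

func main() {
	// Stand-ins for options.CacheDir and the lower-cased website
	// argument parsed in main.go.
	cacheDir := "cache"
	website := "blubeaver"

	// Each site's cached pages now land in their own subdirectory,
	// e.g. cache/blubeaver, instead of one shared cache directory.
	scraper := colly.NewCollector(
		colly.CacheDir(filepath.Join(cacheDir, website)),
	)
	_ = scraper
}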
13 changes: 5 additions & 8 deletions websites/blubeaver.go
@@ -45,10 +45,6 @@ func BluBeaverScraper(scraper **colly.Collector, options *config.Options) {
movieScraper := (*scraper).Clone()
movieScraper.AllowURLRevisit = false

if err := (*scraper).Visit(BluBeaverURL); err != nil {
log.Error.Println("Can't visit index page", log.White(BluBeaverURL), ":", log.Red(err))
}

// Print error just in case
(*scraper).OnError(func(r *colly.Response, err error) {
log.Error.Println(r.Request.URL, "\t", log.White(r.StatusCode), "\nError:", log.Red(err))
@@ -187,10 +183,11 @@

})

// Ensure that all requests are completed before exiting
if (*scraper).Async {
(*scraper).Wait()
movieScraper.Wait()
if err := (*scraper).Visit(BluBeaverURL); err != nil {
log.Error.Println("Can't visit index page", log.White(BluBeaverURL), ":", log.Red(err))
}

// Ensure that all requests are completed before exiting
(*scraper).Wait()
movieScraper.Wait()
}
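Note the second half of the change, repeated in every scraper below: the Visit call moves after the callback registrations, and Wait() is now called unconditionally instead of being guarded by the Async flag. To the best of my understanding of colly, Wait only blocks on outstanding asynchronous jobs and returns immediately for a synchronous collector, so the guard was redundant:

package main

import "github.com/gocolly/colly/v2"

func main() {
	c := colly.NewCollector() // synchronous by default: c.Async is false

	// Old pattern, removed by this commit:
	if c.Async {
		c.Wait()
	}

	// New pattern: safe either way, since Wait returns immediately
	// when no asynchronous requests are pending.
	c.Wait()
}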
13 changes: 5 additions & 8 deletions websites/blusscreens.go
@@ -65,10 +65,6 @@ func BlusScraper(scraper **colly.Collector, options *config.Options) {
movieScraper := (*scraper).Clone()
movieScraper.AllowURLRevisit = false

if err := (*scraper).Visit(BlusURL); err != nil {
log.Error.Println("Can't visit index page", log.White(BlusURL), log.Red(err))
}

// Print error just in case
(*scraper).OnError(func(r *colly.Response, err error) {
log.Error.Println(r.Request.URL, "\t", log.White(r.StatusCode), "\nError:", log.Red(err))
@@ -242,12 +238,13 @@
}
})

// Ensure that all requests are completed before exiting
if (*scraper).Async {
(*scraper).Wait()
movieScraper.Wait()
if err := (*scraper).Visit(BlusURL); err != nil {
log.Error.Println("Can't visit index page", log.White(BlusURL), log.Red(err))
}

// Ensure that all requests are completed before exiting
(*scraper).Wait()
movieScraper.Wait()
}

// Save summary of scraped movies to a JSON file
16 changes: 5 additions & 11 deletions websites/dvdbeaver.go
@@ -49,10 +49,6 @@ func DVDBeaverScraper(scraper **colly.Collector, options *config.Options) {
movieScraper := (*scraper).Clone()
movieScraper.AllowURLRevisit = false

if err := (*scraper).Visit(BeaverURL); err != nil {
log.Error.Println("Can't visit index page:", log.Red(err))
}

// Print error just in case
(*scraper).OnError(func(r *colly.Response, err error) {
log.Error.Println(r.Request.URL, "\t", log.White(r.StatusCode), "\nError:", log.Red(err))
@@ -71,9 +67,6 @@
if err := movieListScraper.Visit(movieListURL); err != nil {
log.Error.Println("Can't visit movie list page", log.White(movieListURL), log.Red(err))
}

// In case we enabled asynchronous jobs
movieListScraper.Wait()
})

// Before making a request print "Visiting ..."
@@ -207,10 +200,11 @@
}
})

// Ensure that all requests are completed before exiting
if (*scraper).Async {
(*scraper).Wait()
movieScraper.Wait()
if err := (*scraper).Visit(BeaverURL); err != nil {
log.Error.Println("Can't visit index page:", log.Red(err))
}

// Ensure that all requests are completed before exiting
(*scraper).Wait()
movieScraper.Wait()
}
13 changes: 5 additions & 8 deletions websites/evanerichards.go
@@ -38,10 +38,6 @@ func EvanERichardsScraper(scraper **colly.Collector, options *config.Options) {
movieScraper := (*scraper).Clone()
movieScraper.AllowURLRevisit = false

if err := (*scraper).Visit(EvanERichardsURL); err != nil {
log.Error.Println("Can't visit index page", log.White(BluBeaverURL), ":", log.Red(err))
}

// Print error just in case
(*scraper).OnError(func(r *colly.Response, err error) {
log.Error.Println(r.Request.URL, "\t", log.White(r.StatusCode), "\nError:", log.Red(err))
@@ -128,10 +124,11 @@ func EvanERichardsScraper(scraper **colly.Collector, options *config.Options) {
}
})

// Ensure that all requests are completed before exiting
if (*scraper).Async {
(*scraper).Wait()
movieScraper.Wait()
if err := (*scraper).Visit(EvanERichardsURL); err != nil {
log.Error.Println("Can't visit index page", log.White(BluBeaverURL), ":", log.Red(err))
}

// Ensure that all requests are completed before exiting
(*scraper).Wait()
movieScraper.Wait()
}
13 changes: 5 additions & 8 deletions websites/film-grab.go
@@ -37,10 +37,6 @@ func FilmGrabScraper(scraper **colly.Collector, options *config.Options) {
movieScraper := (*scraper).Clone()
movieScraper.AllowURLRevisit = false

if err := (*scraper).Visit(FilmGrabURL); err != nil {
log.Error.Println("Can't visit index page", log.White(FilmGrabURL), ":", log.Red(err))
}

// Print error just in case
(*scraper).OnError(func(r *colly.Response, err error) {
log.Error.Println(r.Request.URL, "\t", log.White(r.StatusCode), "\nError:", log.Red(err))
@@ -118,10 +114,11 @@ func FilmGrabScraper(scraper **colly.Collector, options *config.Options) {

})

// Ensure that all requests are completed before exiting
if (*scraper).Async {
(*scraper).Wait()
movieScraper.Wait()
if err := (*scraper).Visit(FilmGrabURL); err != nil {
log.Error.Println("Can't visit index page", log.White(FilmGrabURL), ":", log.Red(err))
}

// Ensure that all requests are completed before exiting
(*scraper).Wait()
movieScraper.Wait()
}
13 changes: 5 additions & 8 deletions websites/highdefdiscnews.go
@@ -37,10 +37,6 @@ func HighDefDiscNewsScraper(scraper **colly.Collector, options *config.Options)
movieScraper := (*scraper).Clone()
movieScraper.AllowURLRevisit = false

if err := (*scraper).Visit(HighDefDiscNewsURL); err != nil {
log.Error.Println("Can't visit index page", log.White(HighDefDiscNewsURL), ":", log.Red(err))
}

// Print error just in case
(*scraper).OnError(func(r *colly.Response, err error) {
log.Error.Println(r.Request.URL, "\t", log.White(r.StatusCode), "\nError:", log.Red(err))
@@ -119,12 +115,13 @@ func HighDefDiscNewsScraper(scraper **colly.Collector, options *config.Options)

})

// Ensure that all requests are completed before exiting
if (*scraper).Async {
(*scraper).Wait()
movieScraper.Wait()
if err := (*scraper).Visit(HighDefDiscNewsURL); err != nil {
log.Error.Println("Can't visit index page", log.White(HighDefDiscNewsURL), ":", log.Red(err))
}

// Ensure that all requests are completed before exiting
(*scraper).Wait()
movieScraper.Wait()
}

// Isolate the movie's title by getting rid of various words on the right.
(3 more changed files not shown)
