202 lines
4.8 KiB
Go
202 lines
4.8 KiB
Go
package scraper
|
|
|
|
import (
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
|
"github.com/gocolly/colly"
|
|
"github.com/jackc/pgx/v5/pgtype"
|
|
"github.com/zepyrshut/rating-orama/internal/sqlc"
|
|
)
|
|
|
|
// Episode holds the data scraped for a single episode of a TV show.
type Episode struct {
	// Season and Episode are the numbers parsed from the episode heading.
	Season, Episode int
	// Released is the parsed release date; it is the zero time when the
	// scraped text could not be parsed.
	Released time.Time
	// Name is the episode title and Plot its summary text.
	Name, Plot string
	// Rate is the star rating as parsed by parseStarRating.
	Rate float32
	// VoteCount is the number of votes as parsed by parseVoteCount.
	VoteCount int
}
|
|
|
|
func (e Episode) ToEpisodeParams(tvShowID int32) sqlc.CreateEpisodesParams {
|
|
|
|
var date pgtype.Date
|
|
_ = date.Scan(e.Released)
|
|
|
|
return sqlc.CreateEpisodesParams{
|
|
TvShowID: tvShowID,
|
|
Season: int32(e.Season),
|
|
Episode: int32(e.Episode),
|
|
Name: e.Name,
|
|
Released: date,
|
|
Plot: e.Plot,
|
|
AvgRating: e.Rate,
|
|
VoteCount: int32(e.VoteCount),
|
|
}
|
|
}
|
|
|
|
func ScrapeEpisodes(ttImdb string) (string, []Episode) {
|
|
c := colly.NewCollector(
|
|
colly.AllowedDomains("imdb.com", "www.imdb.com"),
|
|
)
|
|
|
|
c.OnRequest(func(r *colly.Request) {
|
|
r.Headers.Set("Accept-Language", "en-US")
|
|
})
|
|
|
|
var allSeasons []Episode
|
|
var seasons []int
|
|
var title string
|
|
|
|
c.OnHTML(os.Getenv("SEASON_SELECTOR"), func(e *colly.HTMLElement) {
|
|
seasonText := strings.TrimSpace(e.Text)
|
|
seasonNum, err := strconv.Atoi(seasonText)
|
|
if err == nil {
|
|
seasons = append(seasons, seasonNum)
|
|
}
|
|
})
|
|
|
|
c.OnHTML(os.Getenv("TITLE_SELECTOR"), func(e *colly.HTMLElement) {
|
|
title = e.Text
|
|
})
|
|
|
|
c.OnScraped(func(r *colly.Response) {
|
|
seasonMap := make(map[int]bool)
|
|
var uniqueSeasons []int
|
|
slog.Info("scraped seasons", "seasons", seasons)
|
|
for _, seasonNum := range seasons {
|
|
if !seasonMap[seasonNum] {
|
|
seasonMap[seasonNum] = true
|
|
uniqueSeasons = append(uniqueSeasons, seasonNum)
|
|
}
|
|
}
|
|
|
|
sort.Ints(uniqueSeasons)
|
|
episodeCollector := c.Clone()
|
|
|
|
episodeCollector.OnResponse(func(r *colly.Response) {
|
|
slog.Info("response", "url", r.Request.URL)
|
|
season := extractEpisodesFromSeason(string(r.Body))
|
|
allSeasons = append(allSeasons, season...)
|
|
})
|
|
|
|
for _, seasonNum := range uniqueSeasons {
|
|
seasonURL := fmt.Sprintf(os.Getenv("IMDB_EPISODES_URL"), ttImdb, seasonNum)
|
|
slog.Info("visiting season", "url", seasonURL)
|
|
_ = episodeCollector.Visit(seasonURL)
|
|
}
|
|
|
|
episodeCollector.Wait()
|
|
})
|
|
|
|
_ = c.Visit(fmt.Sprintf(os.Getenv("VISIT_URL"), ttImdb))
|
|
c.Wait()
|
|
|
|
slog.Info("scraped all seasons", "length", len(allSeasons))
|
|
return title, allSeasons
|
|
}
|
|
|
|
func extractEpisodesFromSeason(data string) []Episode {
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(data))
|
|
if err != nil {
|
|
slog.Error("error parsing html")
|
|
return []Episode{}
|
|
}
|
|
|
|
var episodes []Episode
|
|
doc.Find(os.Getenv("EPISODE_CARD_SELECTOR")).Each(func(i int, s *goquery.Selection) {
|
|
var episode Episode
|
|
|
|
seasonEpisodeTitle := s.Find(os.Getenv("SEASON_EPISODE_AND_TITLE_SELECTOR")).Text()
|
|
episode.Season, episode.Episode, episode.Name = parseSeasonEpisodeTitle(seasonEpisodeTitle)
|
|
|
|
releasedDate := s.Find(os.Getenv("RELEASED_DATE_SELECTOR")).Text()
|
|
episode.Released = parseReleasedDate(releasedDate)
|
|
|
|
plot := s.Find(os.Getenv("PLOT_SELECTOR")).Text()
|
|
if plot == "Add a plot" {
|
|
episode.Plot = ""
|
|
} else {
|
|
episode.Plot = plot
|
|
}
|
|
|
|
starRating := s.Find(os.Getenv("STAR_RATING_SELECTOR")).Text()
|
|
episode.Rate = parseStarRating(starRating)
|
|
|
|
voteCount := s.Find(os.Getenv("VOTE_COUNT_SELECTOR")).Text()
|
|
episode.VoteCount = parseVoteCount(voteCount)
|
|
|
|
episodes = append(episodes, episode)
|
|
})
|
|
|
|
slog.Info("extracted episodes", "length", len(episodes))
|
|
return episodes
|
|
}
|
|
|
|
func parseSeasonEpisodeTitle(input string) (int, int, string) {
|
|
re := regexp.MustCompile(`S(\d+)\.E(\d+)\s*∙\s*(.+)`)
|
|
matches := re.FindStringSubmatch(input)
|
|
if len(matches) != 4 {
|
|
return 0, 0, ""
|
|
}
|
|
|
|
seasonNum, err1 := strconv.Atoi(matches[1])
|
|
episodeNum, err2 := strconv.Atoi(matches[2])
|
|
name := strings.TrimSpace(matches[3])
|
|
|
|
if err1 != nil || err2 != nil {
|
|
return 0, 0, ""
|
|
}
|
|
|
|
return seasonNum, episodeNum, name
|
|
}
|
|
|
|
// parseReleasedDate parses a release date written like "Sun, Jan 20, 2008".
//
// Surrounding whitespace is ignored, since the text comes straight from
// scraped HTML. When the text cannot be parsed the failure is logged and the
// zero time.Time is returned.
func parseReleasedDate(releasedDate string) time.Time {
	const layout = "Mon, Jan 2, 2006"

	parsedDate, err := time.Parse(layout, strings.TrimSpace(releasedDate))
	if err != nil {
		slog.Error("error parsing date", "date", releasedDate, "error", err)
		return time.Time{}
	}
	return parsedDate
}
|
|
|
|
// parseStarRating converts the scraped rating text into a float32 in the
// range [0, 10].
//
// Surrounding whitespace is ignored. Text that is not a number, or a value
// outside that range (including NaN and infinities), is logged and reported
// as 0.
func parseStarRating(starRating string) float32 {
	rating, err := strconv.ParseFloat(strings.TrimSpace(starRating), 32)
	// Deliberately a negated range check: ParseFloat accepts "NaN", and NaN
	// compares false against everything, so "rating < 0 || rating > 10"
	// would let it through.
	if err != nil || !(rating >= 0 && rating <= 10) {
		slog.Warn("error parsing rating, out of limits", "rating", starRating)
		return 0
	}
	return float32(rating)
}
|
|
|
|
// voteCountRe matches a parenthesised vote count such as "(987)", "(1.2K)"
// or "(1.5M)", capturing the number and the optional magnitude suffix. It is
// compiled once at package scope rather than on every call.
var voteCountRe = regexp.MustCompile(`\(([\d.]+)([KM]?)\)`)

// parseVoteCount converts a scraped vote count like "(1.2K)" into an
// integer.
//
// A "K" suffix multiplies the number by one thousand and an "M" suffix by
// one million. Text that does not contain a parenthesised count, or whose
// number cannot be parsed, is logged and reported as 0.
func parseVoteCount(voteCount string) int {
	matches := voteCountRe.FindStringSubmatch(voteCount)
	if len(matches) != 3 {
		slog.Error("error parsing vote count", "count", voteCount)
		return 0
	}

	num, err := strconv.ParseFloat(matches[1], 64)
	if err != nil {
		slog.Error("error parsing vote count", "count", voteCount, "error", err)
		return 0
	}

	// Scaled values are rounded to the nearest integer so that binary
	// floating-point error cannot drop a vote. The pattern only admits
	// digits and dots, so num is never negative here.
	switch matches[2] {
	case "K":
		return int(num*1000 + 0.5)
	case "M":
		return int(num*1000000 + 0.5)
	}
	return int(num)
}
|