diff --git a/core/internal/repository/tvshow.go b/core/internal/repository/tvshow.go index 9bb896f..3877528 100644 --- a/core/internal/repository/tvshow.go +++ b/core/internal/repository/tvshow.go @@ -3,8 +3,6 @@ package repository import ( "context" "github.com/jackc/pgx/v5" - "log/slog" - "github.com/zepyrshut/rating-orama/internal/scraper" "github.com/zepyrshut/rating-orama/internal/sqlc" ) @@ -18,11 +16,8 @@ func (r *pgxRepository) CreateTvShowWithEpisodes(ctx context.Context, tvShow sql return err } - slog.Info("episodes lenght", "episodes", len(episodes)) - for _, episode := range episodes { sqlcEpisodeParams := episode.ToEpisodeParams(tvShow.ID) - slog.Info("creating episode", "episode", sqlcEpisodeParams) episode, err := qtx.CreateEpisodes(ctx, sqlcEpisodeParams) if err != nil { return err diff --git a/core/internal/scraper/tvshow.go b/core/internal/scraper/tvshow.go index 3bf4bfe..9b69242 100644 --- a/core/internal/scraper/tvshow.go +++ b/core/internal/scraper/tvshow.go @@ -2,6 +2,7 @@ package scraper import ( "fmt" + "github.com/PuerkitoBio/goquery" "log/slog" "regexp" "sort" @@ -42,12 +43,16 @@ func (e Episode) ToEpisodeParams(tvShowID int32) sqlc.CreateEpisodesParams { } const ( - titleSelector = "h2.sc-b8cc654b-9.dmvgRY" - seasonsSelector = "ul.ipc-tabs a[data-testid='tab-season-entry']" - episodesSelector = "section.sc-1e7f96be-0.ZaQIL" - nextSeasonButtonSelector = "#next-season-btn" - imdbEpisodesURL = "https://www.imdb.com/title/%s/episodes?season=%d" - visitURL = "https://www.imdb.com/title/%s/episodes" + titleSelector = "h2.sc-b8cc654b-9.dmvgRY" + seasonsSelector = "ul.ipc-tabs a[data-testid='tab-season-entry']" + episodeCardSelector = "article.sc-f8507e90-1.cHtpvn.episode-item-wrapper" + seasonEpisodeAndTitleSelector = "div.ipc-title__text" + releasedDateSelector = "span.sc-ccd6e31b-10.dYquTu" + plotSelector = "div.sc-ccd6e31b-11.cVKeME" + starRatingSelector = "span.ipc-rating-star--rating" + voteCountSelector = "span.ipc-rating-star--voteCount" + imdbEpisodesURL = "https://www.imdb.com/title/%s/episodes/?season=%d" + visitURL = "https://www.imdb.com/title/%s/episodes" ) func ScrapeEpisodes(ttImdb string) (string, []Episode) { @@ -87,12 +92,12 @@ func ScrapeEpisodes(ttImdb string) (string, []Episode) { } sort.Ints(uniqueSeasons) - episodeCollector := c.Clone() - episodeCollector.OnHTML(episodesSelector, func(e *colly.HTMLElement) { - seasonEpisodes := extractEpisodesFromSeason(e.Text) - allSeasons = append(allSeasons, seasonEpisodes...) + episodeCollector.OnResponse(func(r *colly.Response) { + slog.Info("response", "url", r.Request.URL) + season := extractEpisodesFromSeason(string(r.Body)) + allSeasons = append(allSeasons, season...) }) for _, seasonNum := range uniqueSeasons { @@ -107,57 +112,102 @@ func ScrapeEpisodes(ttImdb string) (string, []Episode) { c.Visit(fmt.Sprintf(visitURL, ttImdb)) c.Wait() - slog.Info("scraped all seasons", "seasons", allSeasons) + slog.Info("scraped all seasons", "length", len(allSeasons)) return title, allSeasons } func extractEpisodesFromSeason(data string) []Episode { - - slog.Info("extracting episodes", "data", data) - - const pattern = `(S\d+\.E\d+)\s∙\s(.*?)` + - `(Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s` + - `(.*?),\s(\d{4})(.*?)` + - `(\d\.\d{1,2}\/10) \((\d+K)\)Rate` - - re := regexp.MustCompile(pattern) - matches := re.FindAllStringSubmatch(data, -1) - - episodes := make([]Episode, 0, len(matches)) - - slog.Info("matches", "num", len(matches)) - - for _, match := range matches { - var episode Episode - - seasonEpisode := match[1] - - name := strings.TrimSpace(match[2]) - - day := match[3] - dateRest := strings.TrimSpace(match[4]) - year := match[5] - - plot := strings.TrimSpace(match[6]) - rate := match[7] - voteCount := match[8] - - seasonNum := strings.TrimPrefix(strings.Split(seasonEpisode, ".")[0], "S") - episodeNum := strings.TrimPrefix(strings.Split(seasonEpisode, ".")[1], "E") - - votesInt, _ := strconv.Atoi(strings.TrimSuffix(strings.TrimSuffix(voteCount, "K"), "K")) - rateFloat, _ := strconv.ParseFloat(strings.TrimSuffix(rate, "/10"), 32) - - episode.Name = name - episode.Episode, _ = strconv.Atoi(episodeNum) - episode.Season, _ = strconv.Atoi(seasonNum) - episode.Released, _ = time.Parse("Mon, Jan 2, 2006", fmt.Sprintf("%s, %s, %s", day, dateRest, year)) - episode.Plot = plot - episode.Rate = float32(rateFloat) - episode.VoteCount = votesInt * 1000 - - episodes = append(episodes, episode) + doc, err := goquery.NewDocumentFromReader(strings.NewReader(data)) + if err != nil { + slog.Error("error parsing html") + return []Episode{} } + var episodes []Episode + doc.Find(episodeCardSelector).Each(func(i int, s *goquery.Selection) { + var episode Episode + + seasonEpisodeTitle := s.Find(seasonEpisodeAndTitleSelector).Text() + episode.Season, episode.Episode, episode.Name = parseSeasonEpisodeTitle(seasonEpisodeTitle) + + releasedDate := s.Find(releasedDateSelector).Text() + episode.Released = parseReleasedDate(releasedDate) + + plot := s.Find(plotSelector).Text() + if plot == "Add a plot" { + episode.Plot = "" + } else { + episode.Plot = plot + } + + starRating := s.Find(starRatingSelector).Text() + episode.Rate = parseStarRating(starRating) + + voteCount := s.Find(voteCountSelector).Text() + slog.Info("vote count", "count", voteCount) + episode.VoteCount = parseVoteCount(voteCount) + + episodes = append(episodes, episode) + }) + + slog.Info("extracted episodes", "length", len(episodes)) return episodes } + +func parseSeasonEpisodeTitle(input string) (int, int, string) { + re := regexp.MustCompile(`S(\d+)\.E(\d+)\s*∙\s*(.+)`) + matches := re.FindStringSubmatch(input) + if len(matches) != 4 { + return 0, 0, "" + } + + seasonNum, err1 := strconv.Atoi(matches[1]) + episodeNum, err2 := strconv.Atoi(matches[2]) + name := strings.TrimSpace(matches[3]) + + if err1 != nil || err2 != nil { + return 0, 0, "" + } + + return seasonNum, episodeNum, name +} + +func parseReleasedDate(releasedDate string) time.Time { + const layout = "Mon, Jan 2, 2006" + parsedDate, err := time.Parse(layout, releasedDate) + if err != nil { + slog.Error("error parsing date", "date", releasedDate) + return time.Time{} + } + return parsedDate +} + +func parseStarRating(starRating string) float32 { + rating, err := strconv.ParseFloat(starRating, 32) + if err != nil || rating < 0 || rating > 10 { + slog.Warn("error parsing rating, out of limits", "rating", starRating) + return 0 + } + return float32(rating) +} + +func parseVoteCount(voteCount string) int { + re := regexp.MustCompile(`\(([\d.]+)(K?)\)`) + matches := re.FindStringSubmatch(voteCount) + if len(matches) != 3 { + slog.Error("error parsing vote count", "count", voteCount) + return 0 + } + + num, err := strconv.ParseFloat(matches[1], 64) + if err != nil { + slog.Error("error parsing vote count", "count", voteCount) + return 0 + } + + if matches[2] == "K" { + num *= 1000 + } + + return int(num) +} diff --git a/core/internal/scraper/tvshow_test.go b/core/internal/scraper/tvshow_test.go new file mode 100644 index 0000000..640d97b --- /dev/null +++ b/core/internal/scraper/tvshow_test.go @@ -0,0 +1,117 @@ +package scraper + +import ( + "testing" + "time" +) + +func Test_parseSeasonEpisodeTitle(t *testing.T) { + var tests = []struct { + given string + expected struct { + seasonNum int + episodeNum int + name string + } + }{ + {"S5.E1 ∙ Live Free or Die", struct { + seasonNum int + episodeNum int + name string + }{5, 1, "Live Free or Die"}}, + {"S5.E13 ∙ To'hajiilee", struct { + seasonNum int + episodeNum int + name string + }{5, 13, "To'hajiilee"}}, + } + + for _, tt := range tests { + t.Run(tt.given, func(t *testing.T) { + seasonNum, episodeNum, name := parseSeasonEpisodeTitle(tt.given) + if seasonNum != tt.expected.seasonNum || episodeNum != tt.expected.episodeNum || name != tt.expected.name { + t.Errorf("parseSeasonEpisodeTitle(%s): expected %d, %d, %s, actual %d, %d, %s", tt.given, tt.expected.seasonNum, tt.expected.episodeNum, tt.expected.name, seasonNum, episodeNum, name) + } + }) + } +} + +func Test_parseReleasedDate(t *testing.T) { + var tests = []struct { + given string + expected time.Time + }{ + {"", time.Time{}}, + {"1", time.Time{}}, + {"Sun, Feb 3, 2005", time.Date(2005, time.February, 3, 0, 0, 0, 0, time.UTC)}, + {"Mon, Jan 2, 2006", time.Date(2006, time.January, 2, 0, 0, 0, 0, time.UTC)}, + {"Tue, Mar 4, 2007", time.Date(2007, time.March, 4, 0, 0, 0, 0, time.UTC)}, + {"Wed, Apr 5, 2008", time.Date(2008, time.April, 5, 0, 0, 0, 0, time.UTC)}, + {"Thu, May 6, 2009", time.Date(2009, time.May, 6, 0, 0, 0, 0, time.UTC)}, + {"Fri, Jun 7, 2010", time.Date(2010, time.June, 7, 0, 0, 0, 0, time.UTC)}, + {"Sat, Jul 8, 2011", time.Date(2011, time.July, 8, 0, 0, 0, 0, time.UTC)}, + {"Sun, Aug 9, 2012", time.Date(2012, time.August, 9, 0, 0, 0, 0, time.UTC)}, + {"Mon, Sep 10, 2013", time.Date(2013, time.September, 10, 0, 0, 0, 0, time.UTC)}, + {"Tue, Oct 11, 2014", time.Date(2014, time.October, 11, 0, 0, 0, 0, time.UTC)}, + {"Wed, Nov 12, 2015", time.Date(2015, time.November, 12, 0, 0, 0, 0, time.UTC)}, + {"Thu, Dec 13, 2016", time.Date(2016, time.December, 13, 0, 0, 0, 0, time.UTC)}, + } + + for _, tt := range tests { + t.Run(tt.given, func(t *testing.T) { + actual := parseReleasedDate(tt.given) + if actual != tt.expected { + t.Errorf("parseReleasedDate(%s): expected %v, actual %v", tt.given, tt.expected, actual) + } + }) + } +} + +func Test_parseStarRating(t *testing.T) { + var tests = []struct { + given string + expected float32 + }{ + {"1", 1}, + {"1.5", 1.5}, + {"10", 10}, + {"10.5", 0}, + {"0", 0}, + {"999", 0}, + {"hello", 0}, + } + + for _, tt := range tests { + t.Run(tt.given, func(t *testing.T) { + actual := parseStarRating(tt.given) + if actual != tt.expected { + t.Errorf("parseStarRating(%s): expected %f, actual %f", tt.given, tt.expected, actual) + } + }) + } +} + +func Test_parseVoteCount(t *testing.T) { + var tests = []struct { + given string + expected int + }{ + {" (148K)", 148000}, + {" (8K)", 8000}, + {" (12K)", 12000}, + {" (1)", 1}, + {" (10)", 10}, + {" (100)", 100}, + {" (1K)", 1000}, + {" (1.9K)", 1900}, + } + + for _, tt := range tests { + t.Run(tt.given, func(t *testing.T) { + actual := parseVoteCount(tt.given) + if actual != tt.expected { + t.Errorf("parseVoteCount(%s): expected %d, actual %d", tt.given, tt.expected, actual) + } + }) + } +}