Improve data extraction and parser; add tests
This commit is contained in:
parent
e541681dd1
commit
8c1f642e33
@ -3,8 +3,6 @@ package repository
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"github.com/jackc/pgx/v5"
|
"github.com/jackc/pgx/v5"
|
||||||
"log/slog"
|
|
||||||
|
|
||||||
"github.com/zepyrshut/rating-orama/internal/scraper"
|
"github.com/zepyrshut/rating-orama/internal/scraper"
|
||||||
"github.com/zepyrshut/rating-orama/internal/sqlc"
|
"github.com/zepyrshut/rating-orama/internal/sqlc"
|
||||||
)
|
)
|
||||||
@ -18,11 +16,8 @@ func (r *pgxRepository) CreateTvShowWithEpisodes(ctx context.Context, tvShow sql
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Info("episodes lenght", "episodes", len(episodes))
|
|
||||||
|
|
||||||
for _, episode := range episodes {
|
for _, episode := range episodes {
|
||||||
sqlcEpisodeParams := episode.ToEpisodeParams(tvShow.ID)
|
sqlcEpisodeParams := episode.ToEpisodeParams(tvShow.ID)
|
||||||
slog.Info("creating episode", "episode", sqlcEpisodeParams)
|
|
||||||
episode, err := qtx.CreateEpisodes(ctx, sqlcEpisodeParams)
|
episode, err := qtx.CreateEpisodes(ctx, sqlcEpisodeParams)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package scraper
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"github.com/PuerkitoBio/goquery"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"regexp"
|
"regexp"
|
||||||
"sort"
|
"sort"
|
||||||
@ -44,9 +45,13 @@ func (e Episode) ToEpisodeParams(tvShowID int32) sqlc.CreateEpisodesParams {
|
|||||||
const (
|
const (
|
||||||
titleSelector = "h2.sc-b8cc654b-9.dmvgRY"
|
titleSelector = "h2.sc-b8cc654b-9.dmvgRY"
|
||||||
seasonsSelector = "ul.ipc-tabs a[data-testid='tab-season-entry']"
|
seasonsSelector = "ul.ipc-tabs a[data-testid='tab-season-entry']"
|
||||||
episodesSelector = "section.sc-1e7f96be-0.ZaQIL"
|
episodeCardSelector = "article.sc-f8507e90-1.cHtpvn.episode-item-wrapper"
|
||||||
nextSeasonButtonSelector = "#next-season-btn"
|
seasonEpisodeAndTitleSelector = "div.ipc-title__text"
|
||||||
imdbEpisodesURL = "https://www.imdb.com/title/%s/episodes?season=%d"
|
releasedDateSelector = "span.sc-ccd6e31b-10.dYquTu"
|
||||||
|
plotSelector = "div.sc-ccd6e31b-11.cVKeME"
|
||||||
|
starRatingSelector = "span.ipc-rating-star--rating"
|
||||||
|
voteCountSelector = "span.ipc-rating-star--voteCount"
|
||||||
|
imdbEpisodesURL = "https://www.imdb.com/title/%s/episodes/?season=%d"
|
||||||
visitURL = "https://www.imdb.com/title/%s/episodes"
|
visitURL = "https://www.imdb.com/title/%s/episodes"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -87,12 +92,12 @@ func ScrapeEpisodes(ttImdb string) (string, []Episode) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
sort.Ints(uniqueSeasons)
|
sort.Ints(uniqueSeasons)
|
||||||
|
|
||||||
episodeCollector := c.Clone()
|
episodeCollector := c.Clone()
|
||||||
|
|
||||||
episodeCollector.OnHTML(episodesSelector, func(e *colly.HTMLElement) {
|
episodeCollector.OnResponse(func(r *colly.Response) {
|
||||||
seasonEpisodes := extractEpisodesFromSeason(e.Text)
|
slog.Info("response", "url", r.Request.URL)
|
||||||
allSeasons = append(allSeasons, seasonEpisodes...)
|
season := extractEpisodesFromSeason(string(r.Body))
|
||||||
|
allSeasons = append(allSeasons, season...)
|
||||||
})
|
})
|
||||||
|
|
||||||
for _, seasonNum := range uniqueSeasons {
|
for _, seasonNum := range uniqueSeasons {
|
||||||
@ -107,57 +112,102 @@ func ScrapeEpisodes(ttImdb string) (string, []Episode) {
|
|||||||
c.Visit(fmt.Sprintf(visitURL, ttImdb))
|
c.Visit(fmt.Sprintf(visitURL, ttImdb))
|
||||||
c.Wait()
|
c.Wait()
|
||||||
|
|
||||||
slog.Info("scraped all seasons", "seasons", allSeasons)
|
slog.Info("scraped all seasons", "length", len(allSeasons))
|
||||||
return title, allSeasons
|
return title, allSeasons
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractEpisodesFromSeason(data string) []Episode {
|
func extractEpisodesFromSeason(data string) []Episode {
|
||||||
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(data))
|
||||||
slog.Info("extracting episodes", "data", data)
|
if err != nil {
|
||||||
|
slog.Error("error parsing html")
|
||||||
const pattern = `(S\d+\.E\d+)\s∙\s(.*?)` +
|
return []Episode{}
|
||||||
`(Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s` +
|
|
||||||
`(.*?),\s(\d{4})(.*?)` +
|
|
||||||
`(\d\.\d{1,2}\/10) \((\d+K)\)Rate`
|
|
||||||
|
|
||||||
re := regexp.MustCompile(pattern)
|
|
||||||
matches := re.FindAllStringSubmatch(data, -1)
|
|
||||||
|
|
||||||
episodes := make([]Episode, 0, len(matches))
|
|
||||||
|
|
||||||
slog.Info("matches", "num", len(matches))
|
|
||||||
|
|
||||||
for _, match := range matches {
|
|
||||||
var episode Episode
|
|
||||||
|
|
||||||
seasonEpisode := match[1]
|
|
||||||
|
|
||||||
name := strings.TrimSpace(match[2])
|
|
||||||
|
|
||||||
day := match[3]
|
|
||||||
dateRest := strings.TrimSpace(match[4])
|
|
||||||
year := match[5]
|
|
||||||
|
|
||||||
plot := strings.TrimSpace(match[6])
|
|
||||||
rate := match[7]
|
|
||||||
voteCount := match[8]
|
|
||||||
|
|
||||||
seasonNum := strings.TrimPrefix(strings.Split(seasonEpisode, ".")[0], "S")
|
|
||||||
episodeNum := strings.TrimPrefix(strings.Split(seasonEpisode, ".")[1], "E")
|
|
||||||
|
|
||||||
votesInt, _ := strconv.Atoi(strings.TrimSuffix(strings.TrimSuffix(voteCount, "K"), "K"))
|
|
||||||
rateFloat, _ := strconv.ParseFloat(strings.TrimSuffix(rate, "/10"), 32)
|
|
||||||
|
|
||||||
episode.Name = name
|
|
||||||
episode.Episode, _ = strconv.Atoi(episodeNum)
|
|
||||||
episode.Season, _ = strconv.Atoi(seasonNum)
|
|
||||||
episode.Released, _ = time.Parse("Mon, Jan 2, 2006", fmt.Sprintf("%s, %s, %s", day, dateRest, year))
|
|
||||||
episode.Plot = plot
|
|
||||||
episode.Rate = float32(rateFloat)
|
|
||||||
episode.VoteCount = votesInt * 1000
|
|
||||||
|
|
||||||
episodes = append(episodes, episode)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var episodes []Episode
|
||||||
|
doc.Find(episodeCardSelector).Each(func(i int, s *goquery.Selection) {
|
||||||
|
var episode Episode
|
||||||
|
|
||||||
|
seasonEpisodeTitle := s.Find(seasonEpisodeAndTitleSelector).Text()
|
||||||
|
episode.Season, episode.Episode, episode.Name = parseSeasonEpisodeTitle(seasonEpisodeTitle)
|
||||||
|
|
||||||
|
releasedDate := s.Find(releasedDateSelector).Text()
|
||||||
|
episode.Released = parseReleasedDate(releasedDate)
|
||||||
|
|
||||||
|
plot := s.Find(plotSelector).Text()
|
||||||
|
if plot == "Add a plot" {
|
||||||
|
episode.Plot = ""
|
||||||
|
} else {
|
||||||
|
episode.Plot = plot
|
||||||
|
}
|
||||||
|
|
||||||
|
starRating := s.Find(starRatingSelector).Text()
|
||||||
|
episode.Rate = parseStarRating(starRating)
|
||||||
|
|
||||||
|
voteCount := s.Find(voteCountSelector).Text()
|
||||||
|
slog.Info("vote count", "count", voteCount)
|
||||||
|
episode.VoteCount = parseVoteCount(voteCount)
|
||||||
|
|
||||||
|
episodes = append(episodes, episode)
|
||||||
|
})
|
||||||
|
|
||||||
|
slog.Info("extracted episodes", "length", len(episodes))
|
||||||
return episodes
|
return episodes
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// seasonEpisodeTitleRe matches text such as "S5.E1 ∙ Live Free or Die",
// capturing the season number, the episode number and the title. It is
// compiled once at package scope because parseSeasonEpisodeTitle runs for
// every episode card.
var seasonEpisodeTitleRe = regexp.MustCompile(`S(\d+)\.E(\d+)\s*∙\s*(.+)`)

// parseSeasonEpisodeTitle splits a heading of the form "S<season>.E<episode> ∙
// <title>" into its season number, episode number and whitespace-trimmed
// title.
//
// It returns 0, 0, "" when input does not match that form or when either
// number does not fit in an int.
func parseSeasonEpisodeTitle(input string) (int, int, string) {
	matches := seasonEpisodeTitleRe.FindStringSubmatch(input)
	if matches == nil {
		return 0, 0, ""
	}

	// The pattern guarantees digits only, so Atoi can fail solely on overflow.
	seasonNum, err := strconv.Atoi(matches[1])
	if err != nil {
		return 0, 0, ""
	}
	episodeNum, err := strconv.Atoi(matches[2])
	if err != nil {
		return 0, 0, ""
	}

	return seasonNum, episodeNum, strings.TrimSpace(matches[3])
}
|
||||||
|
|
||||||
|
// parseReleasedDate parses a release date written like "Mon, Jan 2, 2006".
//
// Leading and trailing whitespace is ignored, since the text comes straight
// from an HTML node. When the text is not in that layout the failure is
// logged and the zero time.Time is returned.
func parseReleasedDate(releasedDate string) time.Time {
	const layout = "Mon, Jan 2, 2006"

	parsedDate, err := time.Parse(layout, strings.TrimSpace(releasedDate))
	if err != nil {
		slog.Error("error parsing date", "date", releasedDate, "error", err)
		return time.Time{}
	}
	return parsedDate
}
|
||||||
|
|
||||||
|
// parseStarRating converts a rating such as "9.2" to a float32.
//
// Surrounding whitespace is ignored. Text that is not a number, or a value
// outside the range 0 to 10, is logged and reported as 0.
func parseStarRating(starRating string) float32 {
	rating, err := strconv.ParseFloat(strings.TrimSpace(starRating), 32)
	// The range test is written as a negated conjunction so that NaN is
	// rejected as well: ParseFloat accepts "NaN" without error, and every
	// ordered comparison with NaN is false.
	if err != nil || !(rating >= 0 && rating <= 10) {
		slog.Warn("error parsing rating, out of limits", "rating", starRating, "error", err)
		return 0
	}
	return float32(rating)
}
|
||||||
|
|
||||||
|
// voteCountRe matches a parenthesised vote count such as "(148K)", "(1.9K)",
// "(2.3M)" or "(100)", capturing the number and the optional magnitude suffix.
// It is compiled once at package scope because parseVoteCount runs for every
// episode card.
var voteCountRe = regexp.MustCompile(`\(([\d.]+)([KM]?)\)`)

// parseVoteCount converts an abbreviated vote count to an integer.
//
// A "K" suffix multiplies the number by one thousand and an "M" suffix by one
// million; without a suffix the number is used as is. The result is rounded
// to the nearest integer. Input that does not contain a parenthesised count is
// logged and reported as 0.
func parseVoteCount(voteCount string) int {
	matches := voteCountRe.FindStringSubmatch(voteCount)
	if matches == nil {
		slog.Error("error parsing vote count", "count", voteCount)
		return 0
	}

	num, err := strconv.ParseFloat(matches[1], 64)
	if err != nil {
		slog.Error("error parsing vote count", "count", voteCount, "error", err)
		return 0
	}

	switch matches[2] {
	case "K":
		num *= 1000
	case "M":
		num *= 1e6
	}

	// Round instead of truncating: a product such as 4.1 * 1e6 can land just
	// below the intended integer in floating point. The pattern admits no
	// sign, so num is never negative and adding 0.5 rounds to nearest.
	return int(num + 0.5)
}
|
||||||
|
|||||||
117
core/internal/scraper/tvshow_test.go
Normal file
117
core/internal/scraper/tvshow_test.go
Normal file
@ -0,0 +1,117 @@
|
|||||||
|
package scraper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func Test_parseSeasonEpisodeTitle(t *testing.T) {
|
||||||
|
var tests = []struct {
|
||||||
|
given string
|
||||||
|
expected struct {
|
||||||
|
seasonNum int
|
||||||
|
episodeNum int
|
||||||
|
name string
|
||||||
|
}
|
||||||
|
}{
|
||||||
|
{"S5.E1 ∙ Live Free or Die", struct {
|
||||||
|
seasonNum int
|
||||||
|
episodeNum int
|
||||||
|
name string
|
||||||
|
}{5, 1, "Live Free or Die"}},
|
||||||
|
{"S5.E13 ∙ To'hajiilee", struct {
|
||||||
|
seasonNum int
|
||||||
|
episodeNum int
|
||||||
|
name string
|
||||||
|
}{5, 13, "To'hajiilee"}},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.given, func(t *testing.T) {
|
||||||
|
seasonNum, episodeNum, name := parseSeasonEpisodeTitle(tt.given)
|
||||||
|
if seasonNum != tt.expected.seasonNum || episodeNum != tt.expected.episodeNum || name != tt.expected.name {
|
||||||
|
t.Errorf("parseSeasonEpisodeTitle(%s): expected %d, %d, %s, actual %d, %d, %s", tt.given, tt.expected.seasonNum, tt.expected.episodeNum, tt.expected.name, seasonNum, episodeNum, name)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func Test_parseReleasedDate(t *testing.T) {
|
||||||
|
var tests = []struct {
|
||||||
|
given string
|
||||||
|
expected time.Time
|
||||||
|
}{
|
||||||
|
{"", time.Time{}},
|
||||||
|
{"1", time.Time{}},
|
||||||
|
{"Sun, Feb 3, 2005", time.Date(2005, time.February, 3, 0, 0, 0, 0, time.UTC)},
|
||||||
|
{"Mon, Jan 2, 2006", time.Date(2006, time.January, 2, 0, 0, 0, 0, time.UTC)},
|
||||||
|
{"Tue, Mar 4, 2007", time.Date(2007, time.March, 4, 0, 0, 0, 0, time.UTC)},
|
||||||
|
{"Wed, Apr 5, 2008", time.Date(2008, time.April, 5, 0, 0, 0, 0, time.UTC)},
|
||||||
|
{"Thu, May 6, 2009", time.Date(2009, time.May, 6, 0, 0, 0, 0, time.UTC)},
|
||||||
|
{"Fri, Jun 7, 2010", time.Date(2010, time.June, 7, 0, 0, 0, 0, time.UTC)},
|
||||||
|
{"Sat, Jul 8, 2011", time.Date(2011, time.July, 8, 0, 0, 0, 0, time.UTC)},
|
||||||
|
{"Sun, Aug 9, 2012", time.Date(2012, time.August, 9, 0, 0, 0, 0, time.UTC)},
|
||||||
|
{"Mon, Sep 10, 2013", time.Date(2013, time.September, 10, 0, 0, 0, 0, time.UTC)},
|
||||||
|
{"Tue, Oct 11, 2014", time.Date(2014, time.October, 11, 0, 0, 0, 0, time.UTC)},
|
||||||
|
{"Wed, Nov 12, 2015", time.Date(2015, time.November, 12, 0, 0, 0, 0, time.UTC)},
|
||||||
|
{"Thu, Dec 13, 2016", time.Date(2016, time.December, 13, 0, 0, 0, 0, time.UTC)},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.given, func(t *testing.T) {
|
||||||
|
actual := parseReleasedDate(tt.given)
|
||||||
|
if actual != tt.expected {
|
||||||
|
t.Errorf("parseReleasedDate(%s): expected %v, actual %v", tt.given, tt.expected, actual)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func Test_parseStarRating(t *testing.T) {
|
||||||
|
var tests = []struct {
|
||||||
|
given string
|
||||||
|
expected float32
|
||||||
|
}{
|
||||||
|
{"1", 1},
|
||||||
|
{"1.5", 1.5},
|
||||||
|
{"10", 10},
|
||||||
|
{"10.5", 0},
|
||||||
|
{"0", 0},
|
||||||
|
{"999", 0},
|
||||||
|
{"hello", 0},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.given, func(t *testing.T) {
|
||||||
|
actual := parseStarRating(tt.given)
|
||||||
|
if actual != tt.expected {
|
||||||
|
t.Errorf("parseStarRating(%s): expected %f, actual %f", tt.given, tt.expected, actual)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func Test_parseVoteCount(t *testing.T) {
|
||||||
|
var tests = []struct {
|
||||||
|
given string
|
||||||
|
expected int
|
||||||
|
}{
|
||||||
|
{" (148K)", 148000},
|
||||||
|
{" (8K)", 8000},
|
||||||
|
{" (12K)", 12000},
|
||||||
|
{" (1)", 1},
|
||||||
|
{" (10)", 10},
|
||||||
|
{" (100)", 100},
|
||||||
|
{" (1K)", 1000},
|
||||||
|
{" (1.9K)", 1900},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.given, func(t *testing.T) {
|
||||||
|
actual := parseVoteCount(tt.given)
|
||||||
|
if actual != tt.expected {
|
||||||
|
t.Errorf("parseVoteCount(%s): expected %d, actual %d", tt.given, tt.expected, actual)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue
Block a user