repl.it: @Scoder12/imdb titles (Python)
Files
  • main.py
  • poetry.lock
  • pyproject.toml
  • requirements.txt
  • titles.csv
main.py
import replit
replit.clear()  # repl.it helper: clear the console on start

import csv
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse  # used only by the disabled paging code below

import requests
from bs4 import BeautifulSoup as Bs

print("reading file...")
def count_lines():
    lines = 0
    with open("titles.csv", "r") as f:
        for line in f:
            if line.strip():
                lines += 1
    return lines

amnt = count_lines()

if amnt != 0:
    c = input(f"Found {amnt} lines in titles.csv, start from there? (y/n): ")
    if not c.lower().startswith("y"):
        amnt = 0
        print("ok, clearing...")
        # opening in "w" mode truncates the file; no explicit write needed
        with open("titles.csv", "w"):
            pass
    # note: with the ?start= paging code below disabled, answering "y" only
    # keeps the file; the scrape itself still restarts from the first page

def one_ele_list(l):
    """Return the sole element of l (used only by the disabled paging code below)."""
    if len(l) != 1:
        raise ValueError(f"list is not 1 element: {l}")
    return l[0]


def get_imdb_titles(url):
    """Fetch one search-results page; return (titles, next_page_url or None)."""
    # note: IMDB may reject requests that lack a browser-like User-Agent header
    r = requests.get(url)
    if r.status_code != 200:
        print(f"request failed with status {r.status_code}")
        return [], None
    soup = Bs(r.text, 'html.parser')

    # match tags whose parent carries the lister-item-header class
    # (the <a> inside it holds the title text)
    def check(tag):
        return 'lister-item-header' in tag.parent.attrs.get('class', [])

    # the "Next" pagination link; absent on the last page of results
    link = soup.find("a", {'class': "lister-page-next next-page"})
    next_url = None
    if link is not None:
        next_url = link.attrs['href']
        if next_url.startswith("/"):
            next_url = "https://www.imdb.com" + next_url

    titles = []
    for t in soup.find_all(check):
        if t.name != 'a':
            continue
        titles.append(t.get_text().strip())

    return titles, next_url

#links = ['https://www.imdb.com/search/title/?genres=action&title_type=feature&explore=genres']
links = ['https://www.imdb.com/search/title/?title_type=feature&genres=action&explore=genres&ref_=adv_nxt']
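
# illustrative usage (hypothetical values, shown for clarity):
#   page_titles, next_url = get_imdb_titles(links[0])
#   page_titles is the list of title strings on that page; next_url is the
#   following results page, or None once the last page is reached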


titles = []

try:
    for l in links:
        while True:
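            # the triple-quoted block below is disabled code that rebuilt the
            # URL with an explicit ?start= offset (via urlparse/urlencode)
            # instead of following the page's "Next" link; kept for reference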
            """
            #print("while start")
            u = urlparse(l)
            qs = parse_qs(u.query)
            qs = {k: one_ele_list(v) for k, v in qs.items()}
            #if "start" not in qs:
            #print(f'Initializing start to amnt + 1: {amnt}')
            qs['start'] = amnt + 1
            #else:
            #    qs['start'] = amnt + 1
            start = int(qs['start'])
            #print(f"{start}")
            qs['start'] = start
            # convert to tuple bc we can't assign attributes
            u = list(u)
            # assign new query string
            u[4] = urlencode(qs)
            # convert back to url
            l = urlunparse(u)
            #print(l)
            """

            t, l = get_imdb_titles(l)
            if not t:
                break
            titles += t
            # flush this page's titles to disk so progress survives a restart
            with open("titles.csv", "a") as f:
                w = csv.writer(f)
                for title in titles:
                    w.writerow([title])
            titles = []
            amnt = count_lines()

            if l is None:
                print(f"{amnt} titles saved, no next page; done")
                break
            print(f"{amnt} titles, next url {l}")
except KeyboardInterrupt:
    print("stopping parse")
        

#titles = list(dict.fromkeys(titles))

print("writing...")
with open("titles.csv", "a") as f:
    w = csv.writer(f)
    for t in titles:
        w.writerow([t])
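
The commented-out dict.fromkeys line above hints at a dedup pass that never shipped. A minimal sketch of that step as a separate run-after script (the in-place rewrite of titles.csv is an assumption, not part of the repl):

import csv

# read every saved title, skipping blank rows
with open("titles.csv", "r") as f:
    rows = [row[0] for row in csv.reader(f) if row]

# dict.fromkeys drops duplicates while preserving insertion order (Python 3.7+)
unique = list(dict.fromkeys(rows))

# rewrite the file in place with duplicates removed
with open("titles.csv", "w", newline="") as f:
    w = csv.writer(f)
    for title in unique:
        w.writerow([title])

print(f"kept {len(unique)} of {len(rows)} titles")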