Skip to content

Commit 8373af3

Browse files
authored
Merge pull request #45 from ScrappyCocco/retrieve_search_url
Test of regex to retrieve search url
2 parents e2198d0 + e68b30f commit 8373af3

File tree

3 files changed

+87
-46
lines changed

3 files changed

+87
-46
lines changed

howlongtobeatpy/howlongtobeatpy/HTMLRequests.py

Lines changed: 85 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,70 @@ class SearchModifiers(Enum):
2424
HIDE_DLC = "hide_dlc"
2525

2626

27+
class SearchInformations:
    """
    Holder for the search details scraped from a HowLongToBeat JavaScript file.

    Both attributes stay None when the corresponding value can't be found in the script.
    """
    # Search endpoint extracted from the script (None if not found)
    search_url = None
    # API key extracted from the script (None if not found)
    api_key = None

    def __init__(self, script_content: str):
        """
        Parse the given script and store the api key and the search url found in it
        @param script_content: the text content of the script to analyse
        """
        # The key must be extracted first: the search url lookup compares against it
        self.api_key = self.__extract_api_from_script(script_content)
        self.search_url = self.__extract_search_url_script(script_content)
        # When BASE_URL already ends with "/" drop the leading "/" of the endpoint
        # so that BASE_URL + search_url doesn't contain a double slash
        if HTMLRequests.BASE_URL.endswith("/") and self.search_url is not None:
            self.search_url = self.search_url.lstrip("/")

    def __extract_api_from_script(self, script_content: str):
        """
        Function that extracts the hltb code to use in the request from the given script
        @param script_content: the text content of the script to analyse
        @return: the string of the api key found, None if no (non-empty) key is found
        """
        # Try multiple find one after the other as hltb keep changing format
        # Test 1 - The API Key is in the user id in the request json
        # (every captured id is joined together, as the key may be split in more parts)
        user_id_api_key_pattern = r'users\s*:\s*{\s*id\s*:\s*"([^"]+)"'
        matches = re.findall(user_id_api_key_pattern, script_content)
        if matches:
            return ''.join(matches)
        # Test 2 - The API Key is in format fetch("/api/[word here]/".concat("X").concat("Y")...
        concat_api_key_pattern = r'\/api\/\w+\/"(?:\.concat\("[^"]*"\))*'
        for fetch_call in re.findall(concat_api_key_pattern, script_content):
            # Handle each match on its own, so that different fetch calls are never mixed together
            key = ''.join(re.findall(r'\.concat\("([^"]*)"\)', fetch_call))
            # The pattern also matches a fetch without any .concat(...): that gives an empty key,
            # which is not usable, so keep looking instead of returning it
            if key:
                return key
        # Unable to find :(
        return None

    def __extract_search_url_script(self, script_content: str):
        """
        Function that extracts the hltb search url to append from the script as /api/search
        It reads self.api_key, so the api key must already be set when this is called
        @param script_content: the text content of the script to analyse
        @return: the search url to append, None if no fetch call uses the known api key
        """
        pattern = re.compile(
            r'fetch\(\s*["\'](\/api\/[^"\']*)["\']'  # Matches the endpoint
            r'((?:\s*\.concat\(\s*["\']([^"\']*)["\']\s*\))+)'  # Captures concatenated strings
            r'\s*,',  # Matches up to the comma
            re.DOTALL
        )
        for match in pattern.finditer(script_content):
            endpoint = match.group(1)
            concat_calls = match.group(2)
            # Extract all concatenated strings
            concat_strings = re.findall(r'\.concat\(\s*["\']([^"\']*)["\']\s*\)', concat_calls)
            # Only the fetch call that builds the known api key is the search request
            if ''.join(concat_strings) == self.api_key:
                return endpoint
        # Unable to find :(
        return None
83+
84+
2785
class HTMLRequests:
2886
BASE_URL = 'https://howlongtobeat.com/'
2987
REFERER_HEADER = BASE_URL
30-
SEARCH_URL = BASE_URL + "api/s" # should update this to some kind of regex for api/[any alphanumeric characters here] to be more future proof since this keeps changing
3188
GAME_URL = BASE_URL + "game"
89+
# Static search url to use in case it can't be extracted from JS code
90+
SEARCH_URL = BASE_URL + "api/s/"
3291

3392
@staticmethod
3493
def get_search_request_headers():
@@ -46,7 +105,7 @@ def get_search_request_headers():
46105
return headers
47106

48107
@staticmethod
49-
def get_search_request_data(game_name: str, search_modifiers: SearchModifiers, page: int, api_key: str):
108+
def get_search_request_data(game_name: str, search_modifiers: SearchModifiers, page: int, search_info: SearchInformations):
50109
"""
51110
Generate the data payload for the search request
52111
@param game_name: The name of the game to search
@@ -96,8 +155,8 @@ def get_search_request_data(game_name: str, search_modifiers: SearchModifiers, p
96155
}
97156

98157
# If api_key is passed add it to the dict
99-
if api_key is not None:
100-
payload['searchOptions']['users']['id'] = api_key
158+
if search_info is not None and search_info.api_key is not None:
159+
payload['searchOptions']['users']['id'] = search_info.api_key
101160

102161
return json.dumps(payload)
103162

@@ -112,19 +171,21 @@ def send_web_request(game_name: str, search_modifiers: SearchModifiers = SearchM
112171
@return: The HTML code of the research if the request returned 200(OK), None otherwise
113172
"""
114173
headers = HTMLRequests.get_search_request_headers()
115-
api_key_result = HTMLRequests.send_website_request_getcode(False)
116-
if api_key_result is None:
117-
api_key_result = HTMLRequests.send_website_request_getcode(True)
174+
search_info_data = HTMLRequests.send_website_request_getcode(False)
175+
if search_info_data is None or search_info_data.api_key is None:
176+
search_info_data = HTMLRequests.send_website_request_getcode(True)
118177
# Make the request
178+
if search_info_data.search_url is not None:
179+
HTMLRequests.SEARCH_URL = HTMLRequests.BASE_URL + search_info_data.search_url
119180
# The main method currently is the call to the API search URL
120-
search_url_with_key = HTMLRequests.SEARCH_URL + "/" + api_key_result
181+
search_url_with_key = HTMLRequests.SEARCH_URL + search_info_data.api_key
121182
payload = HTMLRequests.get_search_request_data(game_name, search_modifiers, page, None)
122183
resp = requests.post(search_url_with_key, headers=headers, data=payload, timeout=60)
123184
if resp.status_code == 200:
124185
return resp.text
125186
# Try to call with the standard url adding the api key to the user
126187
search_url = HTMLRequests.SEARCH_URL
127-
payload = HTMLRequests.get_search_request_data(game_name, search_modifiers, page, api_key_result)
188+
payload = HTMLRequests.get_search_request_data(game_name, search_modifiers, page, search_info_data)
128189
resp = requests.post(search_url, headers=headers, data=payload, timeout=60)
129190
if resp.status_code == 200:
130191
return resp.text
@@ -141,20 +202,22 @@ async def send_async_web_request(game_name: str, search_modifiers: SearchModifie
141202
@return: The HTML code of the research if the request returned 200(OK), None otherwise
142203
"""
143204
headers = HTMLRequests.get_search_request_headers()
144-
api_key_result = await HTMLRequests.async_send_website_request_getcode(False)
145-
if api_key_result is None:
146-
api_key_result = await HTMLRequests.async_send_website_request_getcode(True)
205+
search_info_data = HTMLRequests.send_website_request_getcode(False)
206+
if search_info_data is None or search_info_data.api_key is None:
207+
search_info_data = HTMLRequests.send_website_request_getcode(True)
147208
# Make the request
209+
if search_info_data.search_url is not None:
210+
HTMLRequests.SEARCH_URL = HTMLRequests.BASE_URL + search_info_data.search_url
148211
# The main method currently is the call to the API search URL
149-
search_url_with_key = HTMLRequests.SEARCH_URL + "/" + api_key_result
212+
search_url_with_key = HTMLRequests.SEARCH_URL + search_info_data.api_key
150213
payload = HTMLRequests.get_search_request_data(game_name, search_modifiers, page, None)
151214
async with aiohttp.ClientSession() as session:
152215
async with session.post(search_url_with_key, headers=headers, data=payload) as resp_with_key:
153216
if resp_with_key is not None and resp_with_key.status == 200:
154217
return await resp_with_key.text()
155218
else:
156219
search_url = HTMLRequests.SEARCH_URL
157-
payload = HTMLRequests.get_search_request_data(game_name, search_modifiers, page, api_key_result)
220+
payload = HTMLRequests.get_search_request_data(game_name, search_modifiers, page, search_info_data)
158221
async with session.post(search_url, headers=headers, data=payload) as resp_user_id:
159222
if resp_user_id is not None and resp_user_id.status == 200:
160223
return await resp_user_id.text()
@@ -240,30 +303,6 @@ async def async_get_game_title(game_id: int):
240303
return HTMLRequests.__cut_game_title(text)
241304
return None
242305

243-
@staticmethod
244-
def extract_api_from_script(script_content: str):
245-
"""
246-
Function that extract the htlb code to use in the request from the given script
247-
@return: the string of the api key found
248-
"""
249-
# Try multiple find one after the other as hltb keep changing format
250-
# Test 1 - The API Key is in the user id in the request json
251-
user_id_api_key_pattern = r'users\s*:\s*{\s*id\s*:\s*"([^"]+)"'
252-
matches = re.findall(user_id_api_key_pattern, script_content)
253-
if matches:
254-
key = ''.join(matches)
255-
return key
256-
# Test 2 - The API Key is in format fetch("/api/[word here]/".concat("X").concat("Y")...
257-
concat_api_key_pattern = r'\/api\/\w+\/"(?:\.concat\("[^"]*"\))*'
258-
matches = re.findall(concat_api_key_pattern, script_content)
259-
if matches:
260-
matches = str(matches).split('.concat')
261-
matches = [re.sub(r'["\(\)\[\]\']', '', match) for match in matches[1:]]
262-
key = ''.join(matches)
263-
return key
264-
# Unable to find :(
265-
return None
266-
267306
@staticmethod
268307
def send_website_request_getcode(parse_all_scripts: bool):
269308
"""
@@ -286,9 +325,10 @@ def send_website_request_getcode(parse_all_scripts: bool):
286325
script_url = HTMLRequests.BASE_URL + script_url
287326
script_resp = requests.get(script_url, headers=headers, timeout=60)
288327
if script_resp.status_code == 200 and script_resp.text is not None:
289-
api_key_result = HTMLRequests.extract_api_from_script(script_resp.text)
290-
if api_key_result is not None:
291-
return api_key_result
328+
search_info = SearchInformations(script_resp.text)
329+
if search_info.api_key is not None:
330+
# The api key is necessary
331+
return search_info
292332
return None
293333

294334
@staticmethod
@@ -317,9 +357,10 @@ async def async_send_website_request_getcode(parse_all_scripts: bool):
317357
async with session.get(script_url, headers=headers) as script_resp:
318358
if script_resp is not None and resp.status == 200:
319359
script_resp_text = await script_resp.text()
320-
api_key_result = HTMLRequests.extract_api_from_script(script_resp_text)
321-
if api_key_result is not None:
322-
return api_key_result
360+
search_info = SearchInformations(script_resp_text)
361+
if search_info.api_key is not None:
362+
# The api key is necessary
363+
return search_info
323364
else:
324365
return None
325366
else:

howlongtobeatpy/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
long_description = fh.read()
55

66
setup(name='howlongtobeatpy',
7-
version='1.0.16',
7+
version='1.0.17',
88
packages=find_packages(exclude=['tests']),
99
description='A Python API for How Long to Beat',
1010
long_description=long_description,

sonar-project.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ sonar.organization=scrappycocco-github
22
sonar.projectKey=ScrappyCocco_HowLongToBeat-PythonAPI
33

44
sonar.projectName=HowLongToBeat-PythonAPI
5-
sonar.projectVersion=1.0.16
5+
sonar.projectVersion=1.0.17
66
sonar.python.version=3.9
77

88
# Path is relative to the sonar-project.properties file. Replace "\" by "/" on Windows.

0 commit comments

Comments
 (0)