Result
1 """
2 The purpose of this file is to:
3 1. Interact with the Wikidata API
4 2. Store the results
5 3. Make the results easy to access from other files
6 """
7
8 import requests
9 import logging
10 from dataclasses import dataclass
11 from openlibrary.core.helpers import days_since
12
13 from datetime import datetime
14 import json
15 from openlibrary.core import db
16
# Module-level logger; the name is a fixed string rather than derived from __name__.
logger = logging.getLogger("core.wikidata")

# Base URL of the Wikidata REST API "items" endpoint; the entity id is appended directly to it.
WIKIDATA_API_URL = 'https://www.wikidata.org/w/rest.php/wikibase/v0/entities/items/'
# A cached entity whose age in days exceeds this value is treated as expired.
WIKIDATA_CACHE_TTL_DAYS = 30
21
22
23 @dataclass
24 class WikidataEntity:
25 """
26 This is the model of the api response from WikiData plus the updated field
27 https://www.wikidata.org/wiki/Wikidata:REST_API
28 """
29
30 id: str
31 type: str
32 labels: dict[str, str]
33 descriptions: dict[str, str]
34 aliases: dict[str, list[str]]
35 statements: dict[str, dict]
36 sitelinks: dict[str, dict]
37 _updated: datetime # This is when we fetched the data, not when the entity was changed in Wikidata
38
39 def get_description(self, language: str = 'en') -> str | None:
40 """If a description isn't available in the requested language default to English"""
41 return self.descriptions.get(language) or self.descriptions.get('en')
42
43 def get_wikipedia_link(self, language: str = 'en') -> tuple[str, str] | None:
44 """
45 Get the Wikipedia URL and language for a given language code.
46 Falls back to English if requested language is unavailable.
47 """
48 requested_wiki = f'{language}wiki'
49 english_wiki = 'enwiki'
50
51 if requested_wiki in self.sitelinks:
52 return self.sitelinks[requested_wiki]['url'], language
53 elif english_wiki in self.sitelinks:
54 return self.sitelinks[english_wiki]['url'], 'en'
55 return None
56
57 @classmethod
58 def from_dict(cls, response: dict, updated: datetime):
59 return cls(
60 **response,
61 _updated=updated,
62 )
63
64 def to_wikidata_api_json_format(self) -> str:
65 """
66 Transforms the dataclass a JSON string like we get from the Wikidata API.
67 This is used for storing the json in the database.
68 """
69 entity_dict = {
70 'id': self.id,
71 'type': self.type,
72 'labels': self.labels,
73 'descriptions': self.descriptions,
74 'aliases': self.aliases,
75 'statements': self.statements,
76 'sitelinks': self.sitelinks,
77 }
78 return json.dumps(entity_dict)
79
80
def _cache_expired(entity: WikidataEntity) -> bool:
    """True when `days_since(entity._updated)` exceeds WIKIDATA_CACHE_TTL_DAYS."""
    age_in_days = days_since(entity._updated)
    return age_in_days > WIKIDATA_CACHE_TTL_DAYS
83
84
def get_wikidata_entity(
    qid: str, bust_cache: bool = False, fetch_missing: bool = False
) -> WikidataEntity | None:
    """
    Look up a Wikidata entity by QID (PIDs would need different endpoints).

    Wikidata is only contacted when:
    - bust_cache is True (the cache is skipped entirely), or
    - the cached copy has expired, or
    - nothing is cached and fetch_missing is True.
    Otherwise the cached entity is returned, or None when there is none.
    The cache-first default avoids overwhelming Wikidata servers with requests
    from every visit to an author page.
    # TODO: After bulk data imports we should set fetch_missing to true (or remove it).
    """
    if bust_cache:
        return _get_from_web(qid)

    cached = _get_from_cache(qid)
    if cached:
        # A stale entry is refreshed from the web; a fresh one is served as-is.
        return _get_from_web(qid) if _cache_expired(cached) else cached

    return _get_from_web(qid) if fetch_missing else None
106
107
# Upper bound, in seconds, on how long a single Wikidata request may wait.
WIKIDATA_REQUEST_TIMEOUT_SECONDS = 10


def _get_from_web(id: str) -> WikidataEntity | None:
    """
    Fetch an entity from the Wikidata REST API and store it in the cache.

    Responses documented here https://doc.wikimedia.org/Wikibase/master/js/rest-api/

    Returns the freshly fetched entity. Returns None, after logging, when the
    request fails (network error, timeout) or the status code is not 200.
    """
    try:
        # Without a timeout a stalled connection would block the caller indefinitely.
        response = requests.get(
            f'{WIKIDATA_API_URL}{id}', timeout=WIKIDATA_REQUEST_TIMEOUT_SECONDS
        )
    except requests.RequestException:
        # Treat transport failures like a non-200 answer: log and report "no entity".
        logger.exception('Wikidata request failed, id: %s', id)
        return None

    if response.status_code != 200:
        logger.error('Wikidata Response: %s, id: %s', response.status_code, id)
        return None

    entity = WikidataEntity.from_dict(
        response=response.json(), updated=datetime.now()
    )
    _add_to_cache(entity)
    return entity
120
121
def _get_from_cache_by_ids(ids: list[str]) -> list[WikidataEntity]:
    """
    Load the cached entities whose id is in `ids` from the `wikidata` table.

    Ids without a cached row are simply absent from the result.
    An empty `ids` returns an empty list without querying the database.
    """
    if not ids:
        # Nothing can match, and an empty IN operand would presumably not be
        # valid SQL, so skip the query altogether.
        return []

    rows = list(
        db.get_db().query(
            'select * from wikidata where id IN ($ids)',
            vars={'ids': ids},
        )
    )
    return [
        WikidataEntity.from_dict(response=row.data, updated=row.updated)
        for row in rows
    ]
132
133
def _get_from_cache(id: str) -> WikidataEntity | None:
    """
    Look up a single entity in the cache.
    The cache is OpenLibrary's Postgres, consulted instead of calling the Wikidata API.
    Returns None when nothing is cached for `id`.
    """
    matches = _get_from_cache_by_ids([id])
    return matches[0] if matches else None
141
142
def _add_to_cache(entity: WikidataEntity) -> None:
    """
    Store `entity` in the `wikidata` table: update its row if one exists, insert otherwise.
    """
    # TODO: after we upgrade to postgres 9.5+ we should use upsert here
    database = db.get_db()
    serialized = entity.to_wikidata_api_json_format()
    already_cached = _get_from_cache(entity.id)

    if not already_cached:
        # We don't provide the updated column on insert because postgres defaults to the current time
        return database.insert("wikidata", id=entity.id, data=serialized)

    return database.update(
        "wikidata",
        where="id=$id",
        vars={'id': entity.id},
        data=serialized,
        updated=entity._updated,
    )
159