| | 235 | /** |
| | 236 | * Get Wikidata IDs. For any unknown IDs, resolve them (normalize and get redirects), |
| | 237 | * and try getting Wikidata IDs again |
| | 238 | */ |
| | 239 | private Map<String, String> resolveWikidataItems(List<String> articles) { |
| | 240 | final Map<String, String> result = getWikidataForArticles0(articles); |
| | 241 | List<String> unresolved = new ArrayList<>(); |
| | 242 | for (String title : articles) { |
| | 243 | if (!result.containsKey(title)) { |
| | 244 | unresolved.add(title); |
| | 245 | } |
| | 246 | } |
| | 247 | if (!unresolved.isEmpty()) { |
| | 248 | final Map<String, String> unresmap = resolveRedirectsForArticles(unresolved); |
| | 249 | final Map<String, String> result2 = getWikidataForArticles0(new ArrayList<>(unresmap.values())); |
| | 250 | for (String original : unresmap.keySet()) { |
| | 251 | final String resolved = unresmap.get(original); |
| | 252 | if (result2.containsKey(resolved)) { |
| | 253 | result.put(original, result2.get(resolved)); |
| | 254 | } |
| | 255 | } |
| | 256 | } |
| | 257 | return result; |
| | 258 | } |
| | 259 | |
| | 289 | /** |
| | 290 | * Given a list of wikipedia titles, returns a map of corresponding normalized title names, |
| | 291 | * or if the title is a redirect page, the result is the redirect target. |
| | 292 | * @todo we should also use this function to normalize existing "wikipedia" tags |
| | 293 | */ |
| | 294 | private Map<String, String> resolveRedirectsForArticles(List<String> articles) { |
| | 295 | try { |
| | 296 | final String url = "https://" + wikipediaLang + ".wikipedia.org/w/api.php" + |
| | 297 | "?action=query" + |
| | 298 | "&redirects" + |
| | 299 | "&format=xml" + |
| | 300 | "&titles=" + articles.stream().map(Utils::encodeUrl).collect(Collectors.joining("|")); |
| | 301 | final Map<String, String> result = new TreeMap<>(), result2 = new TreeMap<>(); |
| | 302 | try (final InputStream in = connect(url).getContent()) { |
| | 303 | final Document xml = newDocumentBuilder().parse(in); |
| | 304 | |
| | 305 | // Add both redirects and normalization results to the same map |
| | 306 | X_PATH.evaluateNodes("//r", xml).forEach(node -> { |
| | 307 | result.put(X_PATH.evaluateString("./@from", node), X_PATH.evaluateString("./@to", node)); |
| | 308 | }); |
| | 309 | X_PATH.evaluateNodes("//n", xml).forEach(node -> { |
| | 310 | final String to = X_PATH.evaluateString("./@to", node); |
| | 311 | result.put(X_PATH.evaluateString("./@from", node), result.getOrDefault(to, to)); |
| | 312 | }); |
| | 313 | } |
| | 314 | // We should only return those keys that were originally requested, excluding titles |
| | 315 | // that are both normalized and redirected |
| | 316 | for (String title : articles) { |
| | 317 | if (result.containsKey(title)) { |
| | 318 | result2.put(title, result.get(title)); |
| | 319 | } |
| | 320 | } |
| | 321 | return result2; |
| | 322 | } catch (Exception ex) { |
| | 323 | throw new RuntimeException(ex); |
| | 324 | } |
| | 325 | } |
| | 326 | |