summaryrefslogtreecommitdiff
path: root/plugins/Irc/extlib/phergie/Phergie/Plugin/Cocktail/db.php
blob: 2e61dd0bdb4b4f92066a0abd2b3494760941bfab (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
<?php

// PHP versions before 5.3 have no __DIR__ magic constant; define an
// equivalent so the rest of the script can use it unconditionally.
if (!defined('__DIR__')) {
    define('__DIR__', dirname(__FILE__));
}

// Create database schema. A database left over from a previous run is removed
// first so the script always starts from an empty table.
echo 'Creating database', PHP_EOL;
$file = __DIR__ . '/cocktail.db';
if (file_exists($file)) {
    unlink($file);
}
$db = new PDO('sqlite:' . $file);
// Make failures visible regardless of the PHP version's default error mode
$db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
$db->exec('CREATE TABLE cocktail (name VARCHAR(255), link VARCHAR(255))');
$db->exec('CREATE UNIQUE INDEX cocktail_name ON cocktail (name)');
// OR IGNORE: the name clean-up done during extraction can map several source
// entries onto the same name; the first one is kept and later ones are skipped
// instead of violating the unique index.
$insert = $db->prepare('INSERT OR IGNORE INTO cocktail (name, link) VALUES (:name, :link)');

// Get raw webtender.com data set, one 150-record page per request
echo 'Downloading webtender.com data set', PHP_EOL;
$start = 1;
do {
    $file = __DIR__ . '/' . $start . '.html';
    // A page already on disk is reused rather than downloaded again. The skip
    // must not `continue`: in a do-while that jumps straight to the loop
    // condition, bypassing the $limit detection and the $start increment below.
    if (!file_exists($file)) {
        $downloaded = copy(
            'http://www.webtender.com/db/browse?level=2&dir=drinks&char=%2A&start=' . $start,
            $file
        );
        if (!$downloaded) {
            echo 'Failed to download records starting at ', $start, PHP_EOL;
            exit(1);
        }
    }
    if (!isset($limit)) {
        // The page is expected to state the total record count as "<n> found"
        $contents = file_get_contents($file);
        if ($contents === false || !preg_match('/([0-9]+) found/', $contents, $match)) {
            echo 'Could not determine the number of records from ', $file, PHP_EOL;
            exit(1);
        }
        // Round the total up to a whole number of 150-record pages; an exact
        // multiple of 150 is kept as is so no empty trailing page is requested.
        $limit = (int) ceil($match[1] / 150) * 150;
    }
    echo 'Got records ', $start, ' - ', min($start + 150, $limit), ' of ', $limit, PHP_EOL;
    $start += 150;
} while ($start < $limit);

// Extract data from data set
// Collect markup errors internally instead of emitting a warning for each one;
// the setting is global, so it only needs to be applied once.
libxml_use_internal_errors(true);
$start = 1;
while ($start < $limit) {
    echo 'Processing ', $start, ' - ', min($start + 150, $limit), ' of ', $limit, PHP_EOL;

    $file = __DIR__ . '/' . $start . '.html';
    $contents = file_get_contents($file);
    if ($contents === false) {
        // Stop rather than feed a failed read into tidy and the HTML parser
        echo 'Could not read ', $file, PHP_EOL;
        exit(1);
    }
    $contents = tidy_repair_string($contents);
    $doc = new DOMDocument;
    $doc->loadHTML($contents);
    libxml_clear_errors();
    $xpath = new DOMXPath($doc);

    // Every link directly inside a list item is treated as one cocktail entry
    $cocktails = $xpath->query('//li/a');
    // One transaction per page of results
    $db->beginTransaction();
    foreach ($cocktails as $cocktail) {
        $name = $cocktail->nodeValue;
        // Strip a leading "The " or trailing " The", any parenthesised text,
        // and a trailing " #<number>" from the displayed name
        $name = preg_replace('/ The$|^The |\s*\([^)]+\)\s*| #[0-9]+$/', '', $name);
        $name = html_entity_decode($name);
        $link = 'http://www.webtender.com' . $cocktail->getAttribute('href');
        // Keys match the named placeholders of the prepared INSERT statement
        $insert->execute(array(':name' => $name, ':link' => $link));
    }
    $db->commit();

    $start += 150;
}

// Clean up: remove the downloaded pages now that their data is in the database
echo 'Cleaning up', PHP_EOL;
for ($start = 1; $start < $limit; $start += 150) {
    $file = __DIR__ . '/' . $start . '.html';
    // A page that was never written (e.g. its download failed) has nothing to
    // remove; calling unlink() on it would only raise a warning.
    if (file_exists($file)) {
        unlink($file);
    }
}