summaryrefslogtreecommitdiff
path: root/maintenance/copyFileBackend.php
blob: b39ff55eda1b68697dfc8677b3b2b1d0cbbf12c4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
<?php
/**
 * Copy all files in some containers of one backend to another.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance
 */

require_once __DIR__ . '/Maintenance.php';

/**
 * Copy all files in one container of one backend to another.
 *
 * This can also be used to re-shard the files for one backend using the
 * config of second backend. The second backend should have the same config
 * as the first, except for it having a different name and different sharding
 * configuration. The backend should be made read-only while this runs.
 * After this script finishes, the old files in the containers can be deleted.
 *
 * @ingroup Maintenance
 */
class CopyFileBackend extends Maintenance {
	/** @var array|null (path sha1 => stat) Pre-computed dst stat entries from listings */
	protected $statCache = null;

	/**
	 * Register the command line options and set the default batch size.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Copy files in one backend to another.";
		$this->addOption( 'src', 'Backend containing the source files', true, true );
		$this->addOption( 'dst', 'Backend where files should be copied to', true, true );
		$this->addOption( 'containers', 'Pipe separated list of containers', true, true );
		$this->addOption( 'subdir', 'Only do items in this child directory', false, true );
		$this->addOption( 'ratefile', 'File to check periodically for batch size', false, true );
		$this->addOption( 'prestat', 'Stat the destination files first (try to use listings)' );
		$this->addOption( 'skiphash', 'Skip SHA-1 sync checks for files' );
		$this->addOption( 'missingonly', 'Only copy files missing from destination listing' );
		$this->addOption( 'syncviadelete', 'Delete destination files missing from source listing' );
		$this->addOption( 'utf8only', 'Skip source files that do not have valid UTF-8 names' );
		$this->setBatchSize( 50 );
	}

	/**
	 * Copy (and optionally prune) the files of every requested container.
	 *
	 * For each container named in --containers this lists the source files
	 * (or, with --missingonly, only those absent from the destination listing),
	 * hands them to copyFileBatch() in batches and, with --syncviadelete,
	 * hands destination files that are absent from the source listing to
	 * delFileBatch().
	 *
	 * @return void
	 */
	public function execute() {
		$src = FileBackendGroup::singleton()->get( $this->getOption( 'src' ) );
		$dst = FileBackendGroup::singleton()->get( $this->getOption( 'dst' ) );
		$containers = explode( '|', $this->getOption( 'containers' ) );
		$subDir = rtrim( $this->getOption( 'subdir', '' ), '/' );

		$rateFile = $this->getOption( 'ratefile' );

		if ( $this->hasOption( 'utf8only' ) && !extension_loaded( 'mbstring' ) ) {
			$this->error( "Cannot check for UTF-8, mbstring extension missing.", 1 ); // die
		}

		foreach ( $containers as $container ) {
			if ( $subDir != '' ) {
				$backendRel = "$container/$subDir";
				$this->output( "Doing container '$container', directory '$subDir'...\n" );
			} else {
				$backendRel = $container;
				$this->output( "Doing container '$container'...\n" );
			}

			if ( $this->hasOption( 'missingonly' ) ) {
				$this->output( "\tBuilding list of missing files..." );
				$srcPathsRel = $this->getListingDiffRel( $src, $dst, $backendRel );
				$this->output( count( $srcPathsRel ) . " file(s) need to be copied.\n" );
			} else {
				$srcPathsRel = $src->getFileList( array(
					'dir' => $src->getRootStoragePath() . "/$backendRel",
					'adviseStat' => true // avoid HEADs
				) );
				if ( $srcPathsRel === null ) {
					$this->error( "Could not list files in $container.", 1 ); // die
				}
			}

			// 'prestat' is a flag, so test for its presence like the other flags
			if ( $this->hasOption( 'prestat' ) && !$this->hasOption( 'missingonly' ) ) {
				// Build the stat cache for the destination files
				$this->output( "\tBuilding destination stat cache..." );
				$dstPathsRel = $dst->getFileList( array(
					'dir' => $dst->getRootStoragePath() . "/$backendRel",
					'adviseStat' => true // avoid HEADs
				) );
				if ( $dstPathsRel === null ) {
					$this->error( "Could not list files in $container.", 1 ); // die
				}
				$this->statCache = array();
				foreach ( $dstPathsRel as $dstPathRel ) {
					$path = $dst->getRootStoragePath() . "/$backendRel/$dstPathRel";
					$this->statCache[sha1( $path )] = $dst->getFileStat( array( 'src' => $path ) );
				}
				$this->output( "done [" . count( $this->statCache ) . " file(s)]\n" );
			}

			$this->output( "\tCopying file(s)...\n" );
			$count = 0;
			$batchPaths = array();
			foreach ( $srcPathsRel as $srcPathRel ) {
				// Check up on the rate file periodically to adjust the concurrency
				$this->refreshBatchSize( $rateFile, $count );
				$batchPaths[$srcPathRel] = 1; // remove duplicates
				if ( count( $batchPaths ) >= $this->mBatchSize ) {
					$this->copyFileBatch( array_keys( $batchPaths ), $backendRel, $src, $dst );
					$batchPaths = array(); // done
				}
				++$count;
			}
			if ( count( $batchPaths ) ) { // left-overs
				$this->copyFileBatch( array_keys( $batchPaths ), $backendRel, $src, $dst );
				$batchPaths = array(); // done
			}
			$this->output( "\tCopied $count file(s).\n" );

			if ( $this->hasOption( 'syncviadelete' ) ) {
				$this->output( "\tBuilding list of excess destination files..." );
				// Arguments are swapped on purpose: files in $dst that $src lacks
				$delPathsRel = $this->getListingDiffRel( $dst, $src, $backendRel );
				$this->output( count( $delPathsRel ) . " file(s) need to be deleted.\n" );

				$this->output( "\tDeleting file(s)...\n" );
				$count = 0;
				$batchPaths = array();
				foreach ( $delPathsRel as $delPathRel ) {
					// Check up on the rate file periodically to adjust the concurrency
					$this->refreshBatchSize( $rateFile, $count );
					$batchPaths[$delPathRel] = 1; // remove duplicates
					if ( count( $batchPaths ) >= $this->mBatchSize ) {
						$this->delFileBatch( array_keys( $batchPaths ), $backendRel, $dst );
						$batchPaths = array(); // done
					}
					++$count;
				}
				if ( count( $batchPaths ) ) { // left-overs
					$this->delFileBatch( array_keys( $batchPaths ), $backendRel, $dst );
					$batchPaths = array(); // done
				}

				$this->output( "\tDeleted $count file(s).\n" );
			}

			if ( $subDir != '' ) {
				$this->output( "Finished container '$container', directory '$subDir'.\n" );
			} else {
				$this->output( "Finished container '$container'.\n" );
			}
		}

		$this->output( "Done.\n" );
	}

	/**
	 * Re-read the batch size from the rate file on every 500th item.
	 *
	 * Does nothing when no rate file was given or when $count is not a
	 * multiple of 500 (0 included, so the file is consulted before the
	 * first item). The value read is clamped to a minimum of 1. If the file
	 * cannot be read, the current batch size is kept and a non-fatal error
	 * is reported instead of silently falling back to a batch size of 1.
	 *
	 * @param string|null $rateFile Value of --ratefile, or null if not given
	 * @param int $count Number of items already processed in the current loop
	 * @return void
	 */
	private function refreshBatchSize( $rateFile, $count ) {
		if ( !$rateFile || ( $count % 500 ) != 0 ) {
			return;
		}
		// is_readable() first so a missing file does not raise a PHP warning
		$contents = is_readable( $rateFile ) ? file_get_contents( $rateFile ) : false;
		if ( $contents === false ) {
			$this->error( "Could not read rate file '$rateFile'; " .
				"keeping batch size {$this->mBatchSize}." );

			return;
		}
		$this->mBatchSize = max( 1, (int)$contents );
		$this->output( "\tBatch size is now {$this->mBatchSize}.\n" );
	}

	/**
	 * List the relative paths that exist under $backendRel in $src but not in $dst.
	 *
	 * Both backends are listed in full; the destination listing is turned into
	 * a lookup table keyed by the SHA-1 of each relative path, and the source
	 * listing is then filtered against it. Dies if either listing fails.
	 *
	 * @param FileBackend $src
	 * @param FileBackend $dst
	 * @param string $backendRel Container name, optionally followed by "/<subdir>"
	 * @return array (rel paths in $src minus those in $dst)
	 */
	protected function getListingDiffRel( FileBackend $src, FileBackend $dst, $backendRel ) {
		$srcPathsRel = $src->getFileList( array(
			'dir' => $src->getRootStoragePath() . "/$backendRel" ) );
		if ( $srcPathsRel === null ) {
			$this->error( "Could not list files in source container.", 1 ); // die
		}
		$dstPathsRel = $dst->getFileList( array(
			'dir' => $dst->getRootStoragePath() . "/$backendRel" ) );
		if ( $dstPathsRel === null ) {
			$this->error( "Could not list files in destination container.", 1 ); // die
		}
		// Get the list of destination files
		$relFilesDstSha1 = array();
		foreach ( $dstPathsRel as $dstPathRel ) {
			$relFilesDstSha1[sha1( $dstPathRel )] = 1;
		}
		unset( $dstPathsRel ); // free
		// Get the list of missing files
		$missingPathsRel = array();
		foreach ( $srcPathsRel as $srcPathRel ) {
			if ( !isset( $relFilesDstSha1[sha1( $srcPathRel )] ) ) {
				$missingPathsRel[] = $srcPathRel;
			}
		}
		unset( $srcPathsRel ); // free

		return $missingPathsRel;
	}

	/**
	 * Copy one batch of files from $src to the same relative location in $dst.
	 *
	 * Files with non-UTF-8 paths (with --utf8only), files that filesAreSame()
	 * reports as already present (unless --missingonly), and files for which
	 * no usable local copy can be obtained are skipped with a message. The
	 * remaining files are stored with overwrite enabled in a single quick
	 * operation batch, which is retried once after 10 seconds before dying.
	 *
	 * @param array $srcPathsRel Paths relative to "<root storage path>/$backendRel"
	 * @param string $backendRel Container name, optionally followed by "/<subdir>"
	 * @param FileBackend $src
	 * @param FileBackend $dst
	 * @return void
	 */
	protected function copyFileBatch(
		array $srcPathsRel, $backendRel, FileBackend $src, FileBackend $dst
	) {
		$ops = array();
		$fsFiles = array();
		$copiedRel = array(); // for output message
		$wikiId = $src->getWikiId();

		// Download the batch of source files into backend cache...
		if ( $this->hasOption( 'missingonly' ) ) {
			$srcPaths = array();
			foreach ( $srcPathsRel as $srcPathRel ) {
				$srcPaths[] = $src->getRootStoragePath() . "/$backendRel/$srcPathRel";
			}
			$t_start = microtime( true );
			$fsFiles = $src->getLocalReferenceMulti( array( 'srcs' => $srcPaths, 'latest' => 1 ) );
			$elapsed_ms = floor( ( microtime( true ) - $t_start ) * 1000 );
			$this->output( "\n\tDownloaded these file(s) [{$elapsed_ms}ms]:\n\t" .
				implode( "\n\t", $srcPaths ) . "\n\n" );
		}

		// Determine what files need to be copied over...
		foreach ( $srcPathsRel as $srcPathRel ) {
			$srcPath = $src->getRootStoragePath() . "/$backendRel/$srcPathRel";
			$dstPath = $dst->getRootStoragePath() . "/$backendRel/$srcPathRel";
			if ( $this->hasOption( 'utf8only' ) && !mb_check_encoding( $srcPath, 'UTF-8' ) ) {
				$this->error( "$wikiId: Detected illegal (non-UTF8) path for $srcPath." );
				continue;
			} elseif ( !$this->hasOption( 'missingonly' )
				&& $this->filesAreSame( $src, $dst, $srcPath, $dstPath )
			) {
				$this->output( "\tAlready have $srcPathRel.\n" );
				continue; // assume already copied...
			}
			// Reuse the bulk download when there is one, else fetch this file now
			$fsFile = array_key_exists( $srcPath, $fsFiles )
				? $fsFiles[$srcPath]
				: $src->getLocalReference( array( 'src' => $srcPath, 'latest' => 1 ) );
			if ( !$fsFile ) {
				// Drop cached info so the existence check below is a fresh one
				$src->clearCache( array( $srcPath ) );
				if ( $src->fileExists( array( 'src' => $srcPath, 'latest' => 1 ) ) === false ) {
					$this->error( "$wikiId: File '$srcPath' was listed but does not exist." );
				} else {
					$this->error( "$wikiId: Could not get local copy of $srcPath." );
				}
				continue;
			} elseif ( !$fsFile->exists() ) {
				// FSFileBackends just return the path for getLocalReference() and paths with
				// illegal slashes may get normalized to a different path. This can cause the
				// local reference to not exist...skip these broken files.
				$this->error( "$wikiId: Detected possible illegal path for $srcPath." );
				continue;
			}
			$fsFiles[] = $fsFile; // keep TempFSFile objects alive as needed
			// Note: prepare() is usually fast for key/value backends
			$status = $dst->prepare( array( 'dir' => dirname( $dstPath ), 'bypassReadOnly' => 1 ) );
			if ( !$status->isOK() ) {
				$this->error( print_r( $status->getErrorsArray(), true ) );
				$this->error( "$wikiId: Could not copy $srcPath to $dstPath.", 1 ); // die
			}
			$ops[] = array( 'op' => 'store',
				'src' => $fsFile->getPath(), 'dst' => $dstPath, 'overwrite' => 1 );
			$copiedRel[] = $srcPathRel;
		}

		if ( !count( $ops ) ) {
			return; // every file was skipped; nothing to send to the backend
		}

		// Copy in the batch of source files...
		$t_start = microtime( true );
		$status = $dst->doQuickOperations( $ops, array( 'bypassReadOnly' => 1 ) );
		if ( !$status->isOK() ) {
			sleep( 10 ); // wait and retry copy again
			$status = $dst->doQuickOperations( $ops, array( 'bypassReadOnly' => 1 ) );
		}
		$elapsed_ms = floor( ( microtime( true ) - $t_start ) * 1000 );
		if ( !$status->isOK() ) {
			$this->error( print_r( $status->getErrorsArray(), true ) );
			$this->error( "$wikiId: Could not copy file batch.", 1 ); // die
		} elseif ( count( $copiedRel ) ) {
			$this->output( "\n\tCopied these file(s) [{$elapsed_ms}ms]:\n\t" .
				implode( "\n\t", $copiedRel ) . "\n\n" );
		}
	}

	/**
	 * Delete one batch of files from $dst.
	 *
	 * All given paths are deleted in a single quick operation batch, which is
	 * retried once after 10 seconds before dying.
	 *
	 * @param array $dstPathsRel Paths relative to "<root storage path>/$backendRel"
	 * @param string $backendRel Container name, optionally followed by "/<subdir>"
	 * @param FileBackend $dst
	 * @return void
	 */
	protected function delFileBatch(
		array $dstPathsRel, $backendRel, FileBackend $dst
	) {
		$ops = array();
		$deletedRel = array(); // for output message
		$wikiId = $dst->getWikiId();

		// Build a delete operation for each destination file...
		foreach ( $dstPathsRel as $dstPathRel ) {
			$dstPath = $dst->getRootStoragePath() . "/$backendRel/$dstPathRel";
			$ops[] = array( 'op' => 'delete', 'src' => $dstPath );
			$deletedRel[] = $dstPathRel;
		}

		// Delete the batch of destination files...
		$t_start = microtime( true );
		$status = $dst->doQuickOperations( $ops, array( 'bypassReadOnly' => 1 ) );
		if ( !$status->isOK() ) {
			sleep( 10 ); // wait and retry the delete again
			$status = $dst->doQuickOperations( $ops, array( 'bypassReadOnly' => 1 ) );
		}
		$elapsed_ms = floor( ( microtime( true ) - $t_start ) * 1000 );
		if ( !$status->isOK() ) {
			$this->error( print_r( $status->getErrorsArray(), true ) );
			$this->error( "$wikiId: Could not delete file batch.", 1 ); // die
		} elseif ( count( $deletedRel ) ) {
			$this->output( "\n\tDeleted these file(s) [{$elapsed_ms}ms]:\n\t" .
				implode( "\n\t", $deletedRel ) . "\n\n" );
		}
	}

	/**
	 * Decide whether the destination already holds an up-to-date copy of a file.
	 *
	 * Both files must have stat info with identical sizes. Beyond that, the
	 * first applicable check wins: MD5 from the stat info when both sides
	 * provide it, destination mtime not older than source mtime with
	 * --skiphash, or otherwise a SHA-1 comparison.
	 *
	 * @param FileBackend $src
	 * @param FileBackend $dst
	 * @param string $sPath Full storage path of the source file
	 * @param string $dPath Full storage path of the destination file
	 * @return bool
	 */
	protected function filesAreSame( FileBackend $src, FileBackend $dst, $sPath, $dPath ) {
		$skipHash = $this->hasOption( 'skiphash' );
		$srcStat = $src->getFileStat( array( 'src' => $sPath ) );
		$dPathSha1 = sha1( $dPath );
		if ( $this->statCache !== null ) {
			// All dst files are already in stat cache
			$dstStat = isset( $this->statCache[$dPathSha1] )
				? $this->statCache[$dPathSha1]
				: false;
		} else {
			$dstStat = $dst->getFileStat( array( 'src' => $dPath ) );
		}
		// Initial fast checks to see if files are obviously different
		$sameFast = (
			is_array( $srcStat ) // sanity check that source exists
			&& is_array( $dstStat ) // dest exists
			&& $srcStat['size'] === $dstStat['size']
		);
		// More thorough checks against files
		if ( !$sameFast ) {
			$same = false; // no need to look farther
		} elseif ( isset( $srcStat['md5'] ) && isset( $dstStat['md5'] ) ) {
			// If MD5 was already in the stat info, just use it.
			// This is useful as many objects stores can return this in object listing,
			// so we can use it to avoid slow per-file HEADs.
			$same = ( $srcStat['md5'] === $dstStat['md5'] );
		} elseif ( $skipHash ) {
			// This mode is good for copying to a backup location or resyncing clone
			// backends in FileBackendMultiWrite (since they get writes second, they have
			// higher timestamps). However, when copying the other way, this hits loads of
			// false positives (possibly 100%) and wastes a bunch of time on GETs/PUTs.
			$same = ( $srcStat['mtime'] <= $dstStat['mtime'] );
		} else {
			// This is the slowest method which does many per-file HEADs (unless an object
			// store tracks SHA-1 in listings).
			$srcSha1 = $src->getFileSha1Base36( array( 'src' => $sPath, 'latest' => 1 ) );
			$dstSha1 = $dst->getFileSha1Base36( array( 'src' => $dPath, 'latest' => 1 ) );
			// Require a real hash: if both lookups failed, their (non-string) failure
			// values would compare equal and the file would wrongly be skipped.
			$same = ( is_string( $srcSha1 ) && $srcSha1 === $dstSha1 );
		}

		return $same;
	}
}

// Name of the maintenance class declared in this file. Presumably read by the
// script included below to decide which class to run -- confirm in Maintenance.php.
$maintClass = 'CopyFileBackend';
require_once RUN_MAINTENANCE_IF_MAIN;