Skip to content

Commit 6c29b20

Browse files
committed
Polish the static files import pipeline
1 parent facd0ea commit 6c29b20

File tree

2 files changed

+118
-74
lines changed

2 files changed

+118
-74
lines changed

components/DataLiberation/Importer/StreamImporter.php

Lines changed: 51 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
use Rowbot\URL\URL;
66
use WordPress\ByteStream\ReadStream\FileReadStream;
77
use WordPress\DataLiberation\BlockMarkup\BlockMarkupUrlProcessor;
8+
use WordPress\DataLiberation\DataLiberationException;
89
use WordPress\DataLiberation\EntityReader\EntityReaderIterator;
910
use WordPress\DataLiberation\EntityReader\WXREntityReader;
1011
use WordPress\DataLiberation\URL\WPURL;
@@ -50,6 +51,10 @@ class StreamImporter {
5051
* in the imported content.
5152
*/
5253
protected $site_url_mapping = array();
54+
/**
55+
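* A list of root URLs that host the source media files. An IMG src URL is treated as a media file to frontload only when it is a child of one of these URLs.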
56+
*/
57+
protected $source_media_root_urls = array();
5358
/**
5459
* A list of candidate base URLs that have been spotted in the WXR file.
5560
*
@@ -73,15 +78,15 @@ class StreamImporter {
7378
* it and how.
7479
*
7580
* Once the API consumer decides on the mapping, it can call
76-
* add_site_url_mapping() to tell the importer what to map that domain to.
81+
* add_url_mapping() to tell the importer what to map that domain to.
7782
*/
7883
protected $site_url_mapping_candidates = array();
7984
protected $entity_reader_factory;
8085
/**
8186
* @param array|string|null $query {
8287
* @type string $uploads_path The directory to download the media attachments to.
8388
* E.g. WP_CONTENT_DIR . '/uploads'
84-
* @type string $uploads_url The URL where the media attachments will be accessible
89+
* @type string $new_media_root_url The URL where the media attachments will be accessible
8590
* after the import. E.g. http://127.0.0.1:9400/wp-content/uploads/
8691
* }
8792
*/
@@ -202,7 +207,7 @@ protected function initialize_from_cursor( $cursor ) {
202207
}
203208
if ( ! empty( $cursor['site_url_mapping'] ) ) {
204209
foreach ( $cursor['site_url_mapping'] as $pair ) {
205-
$this->add_site_url_mapping( $pair['from'], $pair['to'] );
210+
$this->add_url_mapping( $pair['from'], $pair['to'] );
206211
}
207212
}
208213
if ( ! empty( $cursor['site_url_mapping_candidates'] ) ) {
@@ -218,14 +223,10 @@ protected function set_source_site_url( $source_site_url ) {
218223
// override that mapping.
219224
$this->site_url_mapping[-1] = array(
220225
'from' => WPURL::parse( $source_site_url ),
221-
'to' => WPURL::parse( $this->options['new_site_url'] ),
226+
'to' => WPURL::parse( $this->options['new_site_content_root_url'] ),
222227
);
223228
}
224229

225-
protected function get_source_site_url() {
226-
return $this->site_url_mapping[-1]['from'];
227-
}
228-
229230
public function get_site_url_mapping_candidates() {
230231
// Only return the candidates that have been spotted in the last index_entities() call.
231232
if ( self::STAGE_INDEX_ENTITIES !== $this->stage ) {
@@ -240,13 +241,17 @@ public function get_site_url_mapping_candidates() {
240241
return $new_candidates;
241242
}
242243

243-
public function add_site_url_mapping( $from, $to ) {
244+
public function add_url_mapping( $from_url, $to_url ) {
244245
$this->site_url_mapping[] = array(
245-
'from' => WPURL::parse( $from ),
246-
'to' => WPURL::parse( $to ),
246+
'from' => WPURL::parse( $from_url ),
247+
'to' => WPURL::parse( $to_url ),
247248
);
248249
}
249250

251+
public function add_source_media_root_url( $source_media_root_url ) {
252+
$this->source_media_root_urls[] = WPURL::parse( $source_media_root_url );
253+
}
254+
250255
public function get_reentrancy_cursor() {
251256
$serialized_site_url_mapping = array();
252257
foreach ( $this->site_url_mapping as $pair ) {
@@ -272,8 +277,11 @@ public function get_reentrancy_cursor() {
272277
}
273278

274279
protected static function parse_options( $options ) {
275-
if ( ! isset( $options['new_site_url'] ) ) {
276-
$options['new_site_url'] = get_site_url();
280+
if ( ! isset( $options['source_site_url'] ) ) {
281+
throw new DataLiberationException( 'The "source_site_url" option is required' );
282+
}
283+
if ( ! isset( $options['new_site_content_root_url'] ) ) {
284+
$options['new_site_content_root_url'] = get_site_url();
277285
}
278286

279287
if ( ! isset( $options['uploads_path'] ) ) {
@@ -282,11 +290,11 @@ protected static function parse_options( $options ) {
282290
// Remove the trailing slash to make concatenation easier later.
283291
$options['uploads_path'] = rtrim( $options['uploads_path'], '/' );
284292

285-
if ( ! isset( $options['uploads_url'] ) ) {
286-
$options['uploads_url'] = rtrim( $options['new_site_url'], '/' ) . '/wp-content/uploads';
293+
if ( ! isset( $options['new_media_root_url'] ) ) {
294+
$options['new_media_root_url'] = get_site_url() . '/wp-content/uploads';
287295
}
288296
// Remove the trailing slash to make concatenation easier later.
289-
$options['uploads_url'] = rtrim( $options['uploads_url'], '/' );
297+
$options['new_media_root_url'] = rtrim( $options['new_media_root_url'], '/' );
290298

291299
return $options;
292300
}
@@ -297,8 +305,18 @@ protected function __construct(
297305
) {
298306
$this->entity_reader_factory = $entity_reader_factory;
299307
$this->options = $options;
300-
if ( isset( $options['default_source_site_url'] ) ) {
301-
$this->set_source_site_url( $options['default_source_site_url'] );
308+
$this->set_source_site_url( $options['source_site_url'] );
309+
310+
if ( isset( $options['source_media_root_urls'] ) ) {
311+
foreach ( $options['source_media_root_urls'] as $source_media_root_url ) {
312+
$this->add_source_media_root_url( $source_media_root_url );
313+
}
314+
}
315+
316+
if ( isset( $options['additional_url_mappings'] ) ) {
317+
foreach ( $options['additional_url_mappings'] as $additional_url_mapping ) {
318+
$this->add_url_mapping( $additional_url_mapping['from'], $additional_url_mapping['to'] );
319+
}
302320
}
303321
}
304322

@@ -477,7 +495,7 @@ protected function index_next_entities( $count = 10000 ) {
477495
return true;
478496
}
479497

480-
public function get_new_site_url_mapping_candidates() {
498+
public function get_new_site_content_root_url_mapping_candidates() {
481499
$candidates = array();
482500
foreach ( $this->site_url_mapping_candidates as $base_url => $status ) {
483501
if ( false === $status ) {
@@ -737,7 +755,7 @@ protected function import_next_entity() {
737755
$data['local_file_path'] ?? $data['slug'] ?? null
738756
);
739757
if ( file_exists( $this->options['uploads_path'] . '/' . $asset_filename ) ) {
740-
$raw_url = $this->options['uploads_url'] . '/' . $asset_filename;
758+
$raw_url = $this->options['new_media_root_url'] . '/' . $asset_filename;
741759
$p->set_url(
742760
$raw_url,
743761
WPURL::parse( $raw_url )
@@ -766,7 +784,7 @@ protected function import_next_entity() {
766784
$p->replace_base_url( $mapping_pair['to'], $mapping_pair['from'] );
767785
}
768786
do_action( 'data_liberation.stream_importer.rewrite_url', $p, [
769-
'base_url_mapping' => $mapping_pair,
787+
'applied_base_url_mapping' => $mapping_pair,
770788
'raw_url_before' => $raw_url_before,
771789
'entity' => $entity,
772790
]);
@@ -912,11 +930,18 @@ protected function rewrite_attachment_url( string $raw_url, $context_path = null
912930
* @TODO: What other asset types are there?
913931
*/
914932
protected function url_processor_matched_asset_url( BlockMarkupUrlProcessor $p ) {
915-
return (
916-
$p->get_tag() === 'IMG' &&
917-
$p->get_inspected_attribute_name() === 'src' &&
918-
$this->is_child_of_a_mapped_url( $p->get_parsed_url() )
919-
);
933+
if ( $p->get_tag() !== 'IMG' ) {
934+
return false;
935+
}
936+
if ( $p->get_inspected_attribute_name() !== 'src' ) {
937+
return false;
938+
}
939+
foreach ( $this->source_media_root_urls as $source_media_root_url ) {
940+
if ( is_child_url_of( $p->get_parsed_url(), $source_media_root_url ) ) {
941+
return true;
942+
}
943+
}
944+
return false;
920945
}
921946

922947
protected function is_child_of_a_mapped_url( $url_detected_in_content ) {

examples/import-static-files/import-markdown-directory.php

Lines changed: 67 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,8 @@ function help_message_and_die($error = false) {
8080
}
8181

8282
define('IMPORT_ROOT_SLUG', '/imported_content/');
83-
define('SOURCE_SITE_URL', 'https://developer.wordpress.org/docs/getting-started/devenv/');
84-
define('TARGET_SITE_URL', get_site_url() . IMPORT_ROOT_SLUG);
85-
$console_writer->write("Target site URL: " . TARGET_SITE_URL . "\n");
83+
define('NEW_SITE_CONTENT_ROOT', get_site_url() . IMPORT_ROOT_SLUG);
84+
$console_writer->write("Target site URL: " . NEW_SITE_CONTENT_ROOT . "\n");
8685

8786
$parser = new Phalcon\Cop\Parser();
8887
$args = $parser->parse($argv);
@@ -128,61 +127,79 @@ function help_message_and_die($error = false) {
128127
exit(1);
129128
}
130129

131-
// Do the work
132-
130+
/**
131+
* Maps a filesystem path to a WordPress-friendly URL path we can assign
132+
* to the imported page by stripping a trailing .md, .html, or .xhtml extension and lowercasing the result.
133+
*
134+
* Example: "/docs/README.md" -> "/docs/readme"
135+
*
136+
* @param string $path The filesystem path to convert
137+
* @return string The WordPress-friendly URL path
138+
*/
133139
function map_file_path_to_wordpress_url( $path ) {
134-
if (str_ends_with($path, '.md')) {
135-
$path = substr($path, 0, -3);
140+
$extensions = array('.md', '.html', '.xhtml');
141+
foreach ($extensions as $ext) {
142+
if (str_ends_with($path, $ext)) {
143+
$path = substr($path, 0, -strlen($ext));
144+
break;
145+
}
136146
}
137147
return strtolower($path);
138148
}
139149

150+
/**
151+
* Transforms links pointing to imported static files (e.g. ./getting-started.md)
152+
* to the format they will have after being imported into WordPress (e.g. /docs/getting-started).
153+
*/
140154
add_action(
141-
// Rewrite URLs in the post content
142155
'data_liberation.stream_importer.rewrite_url',
143-
function ( $processor, $context ) use ( $console_writer, $chrooted_fs ) {
144-
if(!$context['base_url_mapping']) {
156+
function ( $processor, $context ) use ( $chrooted_fs ) {
157+
// If we didn't rewrite the base URL, the URL points outside
158+
// of the imported root directory. Let's keep it as it is.
159+
if(!$context['applied_base_url_mapping']) {
145160
return;
146161
}
147162

148-
$path = $processor->get_parsed_url()->pathname;
149-
$path_before_rewriting = $path;
150-
$site_url_path_prefix = '';
151-
if($context['base_url_mapping']) {
152-
if(str_starts_with($path, $context['base_url_mapping']['to']->pathname)) {
153-
$site_url_path_prefix = rtrim($context['base_url_mapping']['to']->pathname, '/');
154-
$path = substr($path, strlen($site_url_path_prefix));
155-
}
156-
}
163+
$path_original = $processor->get_parsed_url()->pathname;
157164

158-
if($chrooted_fs->is_file($path)) {
159-
$path = map_file_path_to_wordpress_url($path);
165+
// Strip the path prefix of the applied base URL mapping's target from the URL path and see
166+
// if this URL is pointing to a file that exists in the
167+
// imported directory.
168+
$base_url_path_prefix = rtrim($context['applied_base_url_mapping']['to']->pathname, '/');
169+
$path_relative_to_base = substr($path_original, strlen($base_url_path_prefix));
170+
if(!$chrooted_fs->is_file($path_relative_to_base)) {
171+
return;
160172
}
161173

162-
$path = $site_url_path_prefix . $path;
163-
164-
if($path !== $path_before_rewriting) {
165-
$processor->set_url(
166-
$path,
167-
WPURL::parse($path, $processor->get_parsed_url())
168-
);
169-
}
174+
// Yes! We are linking to an imported page. Let's transform the link
175+
// to a WordPress-friendly URL scheme.
176+
$path_rewritten = $base_url_path_prefix . map_file_path_to_wordpress_url($path_relative_to_base);
177+
$processor->set_url(
178+
$path_rewritten,
179+
WPURL::parse($path_rewritten, $processor->get_parsed_url())
180+
);
170181
},
171182
10,
172183
3
173184
);
174185

186+
/**
187+
* Assigns post_name to every imported static page.
188+
*/
175189
add_filter(
176190
'data_liberation.stream_importer.map_entity',
177-
// wp_insert_post arguments
178-
function ( $entity, $context ) use ( $console_writer ) {
179-
if($entity->get_type() === 'post') {
180-
$data = $entity->get_data();
181-
if(isset($data['local_file_path'])) {
182-
$data['post_name'] = basename(map_file_path_to_wordpress_url($data['local_file_path']));
183-
$entity->set_data($data);
184-
}
191+
function ( $entity ) {
192+
if($entity->get_type() !== 'post') {
193+
return $entity;
194+
}
195+
196+
$data = $entity->get_data();
197+
if(!isset($data['local_file_path'])) {
198+
return $entity;
185199
}
200+
201+
$data['post_name'] = basename(map_file_path_to_wordpress_url($data['local_file_path']));
202+
$entity->set_data($data);
186203
return $entity;
187204
},
188205
10,
@@ -240,18 +257,20 @@ function () use ( $chrooted_fs, $root_id ) {
240257
]
241258
);
242259
}, [
243-
'default_source_site_url' => SOURCE_SITE_URL,
244-
'new_site_url' => TARGET_SITE_URL,
260+
'source_site_url' => 'https://developer.wordpress.org/block-editor/how-to-guides/data-basics/',
261+
'new_site_content_root_url' => NEW_SITE_CONTENT_ROOT,
262+
'source_media_root_urls' => [
263+
'https://developer.wordpress.org/files/',
264+
'https://raw.githubusercontent.com/WordPress/gutenberg/HEAD/docs/',
265+
],
266+
'additional_url_mappings' => [
267+
[
268+
'from' => 'https://developer.wordpress.org/docs/how-to-guides/data-basics/',
269+
'to' => NEW_SITE_CONTENT_ROOT,
270+
],
271+
],
245272
]
246273
);
247-
$importer->add_site_url_mapping(
248-
'https://developer.wordpress.org/block-editor/getting-started/devenv/',
249-
TARGET_SITE_URL
250-
);
251-
$importer->add_site_url_mapping(
252-
'https://developer.wordpress.org/files/',
253-
TARGET_SITE_URL
254-
);
255274

256275
$import_session = ImportSession::create(
257276
array(
@@ -272,7 +291,7 @@ function () use ( $chrooted_fs, $root_id ) {
272291
if($importer->get_stage() === StreamImporter::STAGE_FINISHED) {
273292
$console_writer->write("\n");
274293
$console_writer->write("\033[1;32mImport finished!\033[0m Visit your site at: \n");
275-
$console_writer->write("\033[1;36m" . TARGET_SITE_URL . "\033[0m\n");
294+
$console_writer->write("\033[1;36m" . NEW_SITE_CONTENT_ROOT . "\033[0m\n");
276295
break;
277296
} else if(false === $result) {
278297
$console_writer->write("Failed\n");

0 commit comments

Comments
 (0)