Skip to content

Commit db83efe

Browse files
committed
Flow improvements – base URL for FilesystemEntityReader, preserve metadata from parsed files
1 parent 6c29b20 commit db83efe

File tree

16 files changed

+309
-72
lines changed

16 files changed

+309
-72
lines changed

components/DataLiberation/EntityReader/BlocksWithMetadataEntityReader.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ public function next_entity() {
4646
$all_metadata = $this->metadata;
4747
$post_fields = array();
4848
$other_metadata = array();
49+
4950
foreach ( $all_metadata as $key => $values ) {
5051
if ( in_array( $key, ImportEntity::POST_FIELDS, true ) ) {
5152
$post_fields[ $key ] = $values[0];
@@ -56,6 +57,7 @@ public function next_entity() {
5657

5758
$post_fields['post_id'] = $this->post_id;
5859
$post_fields['post_content'] = $this->block_markup;
60+
$post_fields['parsed_metadata'] = $all_metadata;
5961

6062
// In Markdown, the frontmatter title can be a worse title candidate than
6163
// the first H1 block. In block markup exports, it will be the opposite.

components/DataLiberation/EntityReader/FilesystemEntityReader.php

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
use WordPress\DataLiberation\DataFormatConsumer\MarkupProcessorConsumer;
77
use WordPress\DataLiberation\ImportEntity;
88
use WordPress\DataLiberation\Importer\ImportUtils;
9+
use WordPress\DataLiberation\URL\WPURL;
910
use WordPress\Filesystem\Filesystem;
1011
use WordPress\Filesystem\Visitor\FilesystemVisitor;
1112
use WordPress\Markdown\MarkdownConsumer;
@@ -29,6 +30,7 @@
2930
* 'first_post_id' => 100,
3031
* 'filter_pattern' => '/\.md$/',
3132
* 'index_file_pattern' => 'index\.md',
33+
* 'base_url' => 'https://example.com',
3234
* ];
3335
* $reader = FilesystemEntityReader::create($filesystem, $options);
3436
* while ($reader->next_filesystem_node()) {
@@ -152,6 +154,13 @@ class FilesystemEntityReader implements EntityReader {
152154
*/
153155
private $finished = false;
154156

157+
/**
158+
* The root URL of the imported site.
159+
*
160+
* @var string
161+
*/
162+
private $base_url;
163+
155164
/**
156165
* Initializes the reader with filesystem and options.
157166
*
@@ -182,6 +191,9 @@ public function __construct(
182191
if ( 1 === $options['first_post_id'] ) {
183192
throw new \InvalidArgumentException( 'First node ID must be greater than 1' );
184193
}
194+
if ( ! isset( $options['base_url'] ) ) {
195+
throw new \InvalidArgumentException( 'The "base_url" option is required. It should contain the root URL of the imported site.' );
196+
}
185197

186198
$this->fs = $filesystem;
187199
$this->file_visitor = new FilesystemVisitor( $filesystem );
@@ -190,6 +202,7 @@ public function __construct(
190202
$this->next_post_id = $options['first_post_id'];
191203
$this->filter_pattern = $options['filter_pattern'] ?? '#\.(?:md|html|xhtml|png|jpg|jpeg|gif|svg|webp|mp4)$#';
192204
$this->index_file_pattern = $options['index_file_pattern'] ?? '#^index\.[a-z]+$#';
205+
$this->base_url = $options['base_url'];
193206
if ( isset( $options['root_parent_id'] ) ) {
194207
$this->parent_ids[-1] = $options['root_parent_id'];
195208
}
@@ -243,6 +256,7 @@ public function next_entity(): bool {
243256
'post_type' => $this->post_type,
244257
'guid' => $post_tree_node['local_file_path'],
245258
'local_file_path' => $post_tree_node['local_file_path'],
259+
'link' => WPURL::append_path( $this->base_url, $post_tree_node['local_file_path'] ),
246260
);
247261
if ( $post_tree_node['type'] === 'file' ) {
248262
$extension = pathinfo( $post_tree_node['local_file_path'], PATHINFO_EXTENSION );
@@ -277,7 +291,7 @@ public function next_entity(): bool {
277291
$result = new BlocksWithMetadata( '', array() );
278292
break;
279293
}
280-
} elseif ( $post_tree_node['type'] === 'file_placeholder' ) {
294+
} elseif ( $post_tree_node['type'] === 'index_file_placeholder' ) {
281295
$result = new BlocksWithMetadata( '', array() );
282296
$metadata['post_title'] = ImportUtils::slug_to_title( basename( $post_tree_node['local_file_path'] ) );
283297
}
@@ -383,11 +397,12 @@ private function next_filesystem_node() {
383397
// Let's create a fake page just to have something in the page tree.
384398
$this->parent_ids[ $depth ] = $this->emit_filesystem_node(
385399
array(
386-
'type' => 'file_placeholder',
400+
'type' => 'index_file_placeholder',
387401
'local_file_path' => $dir,
388402
'parent_id' => $parent_id,
389403
)
390404
);
405+
391406
// We're no longer looking for a directory index.
392407
$this->pending_directory_index = null;
393408
} else {

components/DataLiberation/Importer/ImportSession.php

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,18 @@ public function count_unfinished_frontloading_stubs() {
315315
);
316316
}
317317

318+
/**
 * Marks frontloading stubs as ignored, leaving the successfully
 * downloaded ones untouched.
 *
 * `$wpdb->update()` only accepts equality conditions in its WHERE array,
 * so the "status is not succeeded" constraint is expressed with a
 * prepared query instead. Without that constraint every stub, including
 * the ones that were downloaded successfully, would be flipped to the
 * ignored status.
 *
 * @return int|false Number of affected rows, or false on a database error.
 */
public function mark_frontloading_errors_as_ignored() {
	global $wpdb;

	return $wpdb->query(
		$wpdb->prepare(
			"UPDATE {$wpdb->posts} SET post_status = %s WHERE post_type = %s AND post_status != %s",
			self::FRONTLOAD_STATUS_IGNORED,
			'frontloading_stub',
			self::FRONTLOAD_STATUS_SUCCEEDED
		)
	);
}
329+
318330
public function get_frontloading_stubs( $options = array() ) {
319331
$query = new WP_Query(
320332
array(

components/DataLiberation/Importer/StreamImporter.php

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
namespace WordPress\DataLiberation\Importer;
44

5-
use Rowbot\URL\URL;
65
use WordPress\ByteStream\ReadStream\FileReadStream;
76
use WordPress\DataLiberation\BlockMarkup\BlockMarkupUrlProcessor;
87
use WordPress\DataLiberation\DataLiberationException;
@@ -291,11 +290,15 @@ protected static function parse_options( $options ) {
291290
$options['uploads_path'] = rtrim( $options['uploads_path'], '/' );
292291

293292
if ( ! isset( $options['new_media_root_url'] ) ) {
294-
$options['new_media_root_url'] = get_site_url() . '/wp-content/uploads';
293+
$options['new_media_root_url'] = rtrim( get_site_url(), '/' ) . '/wp-content/uploads';
295294
}
296295
// Remove the trailing slash to make concatenation easier later.
297296
$options['new_media_root_url'] = rtrim( $options['new_media_root_url'], '/' );
298297

298+
if ( ! isset( $options['index_batch_size'] ) ) {
299+
$options['index_batch_size'] = 10000;
300+
}
301+
299302
return $options;
300303
}
301304

@@ -388,7 +391,7 @@ public function advance_to_next_stage() {
388391
protected $indexed_entities_counts = array();
389392
protected $indexed_assets_urls = array();
390393

391-
protected function index_next_entities( $count = 10000 ) {
394+
protected function index_next_entities() {
392395
if ( null !== $this->next_stage ) {
393396
return false;
394397
}
@@ -417,15 +420,15 @@ protected function index_next_entities( $count = 10000 ) {
417420
* Internalize the loop to avoid computing the reentrancy cursor
418421
* on every entity in the imported data stream.
419422
*/
420-
for ( $i = 0; $i < $count; ++$i ) {
423+
for ( $i = 0; $i < $this->options['index_batch_size']; ++$i ) {
421424
if ( ! $this->entity_iterator->valid() ) {
422425
break;
423426
}
424427
/**
425428
* Identify the static assets referenced in the current entity
426429
* and enqueue them for download.
427430
*/
428-
$entity = $this->entity_iterator->current();
431+
$entity = $this->get_current_entity();
429432

430433
$type = $entity->get_type();
431434

@@ -477,13 +480,17 @@ protected function index_next_entities( $count = 10000 ) {
477480
}
478481
} elseif ( isset( $data['post_content'] ) ) {
479482
$post = $data;
480-
$p = new BlockMarkupUrlProcessor( $post['post_content'], $this->source_site_url );
483+
$p = new BlockMarkupUrlProcessor( $post['post_content'], $this->get_post_base_url( $post ) );
484+
481485
while ( $p->next_url() ) {
482486
if ( ! $this->url_processor_matched_asset_url( $p ) ) {
483487
continue;
484488
}
485489
// @TODO: Consider using sha1 hashes to prevent huge URLs from blowing up the memory.
486-
$this->indexed_assets_urls[ $p->get_raw_url() ] = true;
490+
// @TODO: Use a consistent identifier for tracking download progress. Unfortunately,
491+
// $p->get_raw_url() does not line up with the resolved URL later on. The progress
492+
// events are emitted with the full, resolved URL.
493+
$this->indexed_assets_urls[ $p->get_parsed_url().'' ] = true;
487494
}
488495
}
489496
break;
@@ -631,7 +638,7 @@ protected function frontload_next_entity() {
631638
* Identify the static assets referenced in the current entity
632639
* and enqueue them for download.
633640
*/
634-
$entity = $this->entity_iterator->current();
641+
$entity = $this->get_current_entity();
635642
$cursor = $this->entity_iterator->get_reentrancy_cursor();
636643
$this->active_downloads[ $cursor ] = array();
637644

@@ -655,7 +662,7 @@ protected function frontload_next_entity() {
655662
}
656663
} elseif ( isset( $data['post_content'] ) ) {
657664
$post = $data;
658-
$p = new BlockMarkupUrlProcessor( $post['post_content'], $this->source_site_url );
665+
$p = new BlockMarkupUrlProcessor( $post['post_content'], $this->get_post_base_url( $post ) );
659666
while ( $p->next_url() ) {
660667
if ( ! $this->url_processor_matched_asset_url( $p ) ) {
661668
continue;
@@ -678,6 +685,25 @@ protected function frontload_next_entity() {
678685
return true;
679686
}
680687

688+
/**
 * Returns the entity the iterator currently points at, after passing it
 * through the `data_liberation.stream_importer.map_entity` filter.
 *
 * @return mixed The (possibly filtered) current entity.
 */
protected function get_current_entity() {
	$current_entity = $this->entity_iterator->current();

	return apply_filters(
		'data_liberation.stream_importer.map_entity',
		$current_entity,
		[
			'importer' => $this,
		]
	);
}
695+
696+
/**
 * Determines the base URL to use when processing URLs found in a post.
 *
 * The post's own `link` is used when it is set and not null; otherwise
 * the source site URL is used. The result is passed through the
 * `data_liberation.stream_importer.post_base_url` filter.
 *
 * @param array $post Post data; may contain a `link` key.
 * @return mixed The filtered base URL.
 */
protected function get_post_base_url( $post ) {
	$default_base_url = $post['link'] ?? $this->source_site_url;
	$filter_context   = [
		'post'     => $post,
		'importer' => $this,
	];

	return apply_filters(
		'data_liberation.stream_importer.post_base_url',
		$default_base_url,
		$filter_context
	);
}
706+
681707
/**
682708
* @TODO: Explore a way of making this idempotent. Maybe
683709
* use GUIDs to detect whether a post or an attachment
@@ -705,7 +731,7 @@ protected function import_next_entity() {
705731
return false;
706732
}
707733

708-
$entity = $this->entity_iterator->current();
734+
$entity = $this->get_current_entity();
709735

710736
$attachments = array();
711737
// Rewrite the URLs in the post.
@@ -736,7 +762,7 @@ protected function import_next_entity() {
736762
if ( ! isset( $data[ $key ] ) ) {
737763
continue;
738764
}
739-
$p = new BlockMarkupUrlProcessor( $data[ $key ], $this->source_site_url );
765+
$p = new BlockMarkupUrlProcessor( $data[ $key ], $this->get_post_base_url( $data ) );
740766
while ( $p->next_url() ) {
741767
// Relative URLs are okay at this stage.
742768
if ( ! $p->get_raw_url() ) {
@@ -755,7 +781,7 @@ protected function import_next_entity() {
755781
$data['local_file_path'] ?? $data['slug'] ?? null
756782
);
757783
if ( file_exists( $this->options['uploads_path'] . '/' . $asset_filename ) ) {
758-
$raw_url = $this->options['new_media_root_url'] . '/' . $asset_filename;
784+
$raw_url = rtrim( $this->options['new_media_root_url'], '/' ) . '/' . $asset_filename;
759785
$p->set_url(
760786
$raw_url,
761787
WPURL::parse( $raw_url )
@@ -795,10 +821,6 @@ protected function import_next_entity() {
795821
$entity->set_data( $data );
796822
break;
797823
}
798-
799-
$entity = apply_filters( 'data_liberation.stream_importer.map_entity', $entity, [
800-
'importer' => $this,
801-
]);
802824

803825
$post_id = $this->entity_sink->import_entity( $entity );
804826
if ( false !== $post_id ) {
@@ -913,7 +935,7 @@ protected function rewrite_attachment_url( string $raw_url, $context_path = null
913935
}
914936
$base_url = $this->source_site_url;
915937
if ( null !== $base_url && null !== $context_path ) {
916-
$base_url = $base_url . '/' . ltrim( $context_path, '/' );
938+
$base_url = rtrim( $base_url, '/' ) . '/' . ltrim( $context_path, '/' );
917939
}
918940
$parsed_url = WPURL::parse( $raw_url, $base_url );
919941
if ( false === $parsed_url ) {

components/DataLiberation/Tests/FilesystemEntityReaderTest.php

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ public function test_with_create_index_pages_true() {
1414
'create_index_pages' => true,
1515
'filter_pattern' => '#\.html$#',
1616
'index_file_pattern' => '#root.html#',
17+
'base_url' => 'https://example.com',
1718
)
1819
);
1920
$entities = array();
@@ -77,10 +78,11 @@ public function test_with_create_index_pages_true() {
7778

7879
public function test_uses_root_parent_id_as_top_level_parent() {
7980
$reader = new FilesystemEntityReader(
80-
LocalFilesystem::create( __DIR__ . '/fixtures/filesystem-entity-reader' ),
81+
LocalFilesystem::create( __DIR__ . '/fixtures/filesystem-entity-reader/simple-structure' ),
8182
array(
8283
'root_parent_id' => 2,
8384
'first_post_id' => 3,
85+
'base_url' => 'https://example.com',
8486
)
8587
);
8688
$entities = array();
@@ -98,18 +100,37 @@ public function test_uses_root_parent_id_as_top_level_parent() {
98100

99101
public function test_preserves_file_extension_in_the_post_name() {
100102
$reader = new FilesystemEntityReader(
101-
LocalFilesystem::create( __DIR__ . '/fixtures/filesystem-entity-reader' ),
103+
LocalFilesystem::create( __DIR__ . '/fixtures/filesystem-entity-reader/simple-structure' ),
102104
array(
103105
'first_post_id' => 2,
104106
'create_index_pages' => true,
105107
'filter_pattern' => '#\.html$#',
106108
'index_file_pattern' => '#root.html#',
109+
'base_url' => 'https://example.com',
107110
)
108111
);
109112
$entities = $this->get_post_entities( $reader );
110-
$this->assertEquals( 'root.html', $entities[0]['post_name'] );
111-
$this->assertEquals( 'nested', $entities[1]['post_name'] );
112-
$this->assertEquals( 'page1.html', $entities[2]['post_name'] );
113+
$this->assertEquals( 'https://example.com/root.html', $entities[0]['link'] );
114+
$this->assertEquals( 'https://example.com/nested', $entities[1]['link'] );
115+
$this->assertEquals( 'https://example.com/nested/page1.html', $entities[2]['link'] );
116+
}
117+
118+
/**
 * Reads the `with-nested-images-directory` fixture with an HTML-only
 * filter and expects exactly three entities, none of them for the
 * images directory.
 */
public function test_leaves_out_directories_with_no_content() {
	$fixture_root   = __DIR__ . '/fixtures/filesystem-entity-reader/with-nested-images-directory';
	$reader_options = array(
		'first_post_id'      => 2,
		'create_index_pages' => true,
		'filter_pattern'     => '#\.html$#',
		'index_file_pattern' => '#root.html#',
		'base_url'           => 'https://example.com',
	);

	$reader   = new FilesystemEntityReader( LocalFilesystem::create( $fixture_root ), $reader_options );
	$entities = $this->get_post_entities( $reader );

	$this->assertCount( 3, $entities );

	$expected_links = array(
		'https://example.com/root.html',
		'https://example.com/nested',
		'https://example.com/nested/page1.html',
	);
	foreach ( $expected_links as $position => $expected_link ) {
		$this->assertEquals( $expected_link, $entities[ $position ]['link'] );
	}
}
114135

115136
private function get_post_entities($reader) {

components/DataLiberation/Tests/fixtures/filesystem-entity-reader/with-nested-images-directory/nested/images/screenshot.png

Loading
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
<h1>Page 1</h1>
2+
<p>This is page 1.</p>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
<h1>Root</h1>
2+
<p>This is the root page.</p>

components/DataLiberation/URL/WPURL.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,4 +73,11 @@ public static function has_double_slash( $raw_url ) {
7373
)
7474
);
7575
}
76+
77+
/**
 * Appends a path to a base URL, joining the two with a single slash.
 *
 * Trailing slashes of the base URL's pathname and leading slashes of
 * $path are stripped before joining, so neither a doubled nor a missing
 * separator can occur at the seam.
 *
 * @param string $base_url URL to extend.
 * @param string $path     Path to append, with or without a leading slash.
 * @return string The serialized URL with the extended pathname.
 * @throws \InvalidArgumentException When $base_url cannot be parsed.
 */
public static function append_path( $base_url, $path ) {
	$parsed_url = self::parse( $base_url );
	if ( ! $parsed_url ) {
		// parse() signals failure with a falsy value; without this guard
		// the property assignment below would fail on a non-object.
		throw new \InvalidArgumentException(
			sprintf( 'Cannot append a path to an invalid base URL: "%s"', (string) $base_url )
		);
	}

	$parsed_url->pathname = rtrim( $parsed_url->pathname, '/' ) . '/' . ltrim( $path, '/' );

	return $parsed_url->toString();
}
82+
7683
}

0 commit comments

Comments
 (0)