2
2
from multiprocessing .pool import ThreadPool
3
3
from typing import TYPE_CHECKING , Callable , Dict , List , Tuple
4
4
5
- from cycode .cli .consts import (
6
- SCAN_BATCH_MAX_FILES_COUNT ,
7
- SCAN_BATCH_MAX_PARALLEL_SCANS ,
8
- SCAN_BATCH_MAX_SIZE_IN_BYTES ,
9
- SCAN_BATCH_SCANS_PER_CPU ,
10
- )
5
+ from cycode .cli import consts
11
6
from cycode .cli .models import Document
12
7
from cycode .cli .utils .progress_bar import ScanProgressBarSection
13
8
18
13
19
14
def split_documents_into_batches (
20
15
documents : List [Document ],
21
- max_size_mb : int = SCAN_BATCH_MAX_SIZE_IN_BYTES ,
22
- max_files_count : int = SCAN_BATCH_MAX_FILES_COUNT ,
16
+ max_size : int = consts . DEFAULT_SCAN_BATCH_MAX_SIZE_IN_BYTES ,
17
+ max_files_count : int = consts . DEFAULT_SCAN_BATCH_MAX_FILES_COUNT ,
23
18
) -> List [List [Document ]]:
24
19
batches = []
25
20
@@ -28,7 +23,7 @@ def split_documents_into_batches(
28
23
for document in documents :
29
24
document_size = len (document .content .encode ('UTF-8' ))
30
25
31
- if (current_size + document_size > max_size_mb ) or (len (current_batch ) >= max_files_count ):
26
+ if (current_size + document_size > max_size ) or (len (current_batch ) >= max_files_count ):
32
27
batches .append (current_batch )
33
28
34
29
current_batch = [document ]
@@ -45,17 +40,18 @@ def split_documents_into_batches(
45
40
46
41
def _get_threads_count () -> int :
47
42
cpu_count = os .cpu_count () or 1
48
- return min (cpu_count * SCAN_BATCH_SCANS_PER_CPU , SCAN_BATCH_MAX_PARALLEL_SCANS )
43
+ return min (cpu_count * consts . SCAN_BATCH_SCANS_PER_CPU , consts . SCAN_BATCH_MAX_PARALLEL_SCANS )
49
44
50
45
51
46
def run_parallel_batched_scan (
52
47
scan_function : Callable [[List [Document ]], Tuple [str , 'CliError' , 'LocalScanResult' ]],
48
+ scan_type : str ,
53
49
documents : List [Document ],
54
50
progress_bar : 'BaseProgressBar' ,
55
- max_size_mb : int = SCAN_BATCH_MAX_SIZE_IN_BYTES ,
56
- max_files_count : int = SCAN_BATCH_MAX_FILES_COUNT ,
57
51
) -> Tuple [Dict [str , 'CliError' ], List ['LocalScanResult' ]]:
58
- batches = split_documents_into_batches (documents , max_size_mb , max_files_count )
52
+ max_size = consts .SCAN_BATCH_MAX_SIZE_IN_BYTES .get (scan_type , consts .DEFAULT_SCAN_BATCH_MAX_SIZE_IN_BYTES )
53
+ batches = split_documents_into_batches (documents , max_size )
54
+
59
55
progress_bar .set_section_length (ScanProgressBarSection .SCAN , len (batches )) # * 3
60
56
# TODO(MarshalX): we should multiply the count of batches in SCAN section because each batch has 3 steps:
61
57
# 1. scan creation
0 commit comments