4
4
5
5
using System . Collections . Concurrent ;
6
6
using System . Diagnostics . CodeAnalysis ;
7
+ using System . IO . Abstractions ;
7
8
using System . Security . Cryptography ;
8
9
using Amazon . S3 ;
9
10
using Amazon . S3 . Model ;
10
11
using Microsoft . Extensions . Logging ;
11
12
12
13
namespace Documentation . Assembler . Deploying ;
13
14
14
- public class AwsS3SyncPlanStrategy ( ILoggerFactory logFactory , IAmazonS3 s3Client , string bucketName , AssembleContext context ) : IDocsSyncPlanStrategy
15
+ public interface IS3EtagCalculator
16
+ {
17
+ Task < string > CalculateS3ETag ( string filePath , Cancel ctx = default ) ;
18
+ }
19
+
20
+ public class S3EtagCalculator ( ILoggerFactory logFactory , IFileSystem readFileSystem ) : IS3EtagCalculator
15
21
{
16
- internal const long PartSize = 5 * 1024 * 1024 ; // 5MB
17
22
private readonly ILogger < AwsS3SyncPlanStrategy > _logger = logFactory . CreateLogger < AwsS3SyncPlanStrategy > ( ) ;
23
+
18
24
private static readonly ConcurrentDictionary < string , string > EtagCache = new ( ) ;
19
25
26
+ internal const long PartSize = 5 * 1024 * 1024 ; // 5MB
27
+
28
+ [ SuppressMessage ( "Security" , "CA5351:Do Not Use Broken Cryptographic Algorithms" ) ]
29
+ public async Task < string > CalculateS3ETag ( string filePath , Cancel ctx = default )
30
+ {
31
+ if ( EtagCache . TryGetValue ( filePath , out var cachedEtag ) )
32
+ {
33
+ _logger . LogDebug ( "Using cached ETag for {Path}" , filePath ) ;
34
+ return cachedEtag ;
35
+ }
36
+
37
+ var fileInfo = readFileSystem . FileInfo . New ( filePath ) ;
38
+ var fileSize = fileInfo . Length ;
39
+
40
+ // For files under 5MB, use simple MD5 (matching TransferUtility behavior)
41
+ if ( fileSize <= PartSize )
42
+ {
43
+ await using var stream = readFileSystem . FileStream . New ( filePath , FileMode . Open , FileAccess . Read , FileShare . Read ) ;
44
+ var smallBuffer = new byte [ fileSize ] ;
45
+ var bytesRead = await stream . ReadAsync ( smallBuffer . AsMemory ( 0 , ( int ) fileSize ) , ctx ) ;
46
+ var hash = MD5 . HashData ( smallBuffer . AsSpan ( 0 , bytesRead ) ) ;
47
+ var etag = Convert . ToHexStringLower ( hash ) ;
48
+ EtagCache [ filePath ] = etag ;
49
+ return etag ;
50
+ }
51
+
52
+ // For files over 5MB, use multipart format with 5MB parts (matching TransferUtility)
53
+ var parts = ( int ) Math . Ceiling ( ( double ) fileSize / PartSize ) ;
54
+
55
+ await using var fileStream = readFileSystem . FileStream . New ( filePath , FileMode . Open , FileAccess . Read , FileShare . Read ) ;
56
+ var partBuffer = new byte [ PartSize ] ;
57
+ var partHashes = new List < byte [ ] > ( ) ;
58
+
59
+ for ( var i = 0 ; i < parts ; i ++ )
60
+ {
61
+ var bytesRead = await fileStream . ReadAsync ( partBuffer . AsMemory ( 0 , partBuffer . Length ) , ctx ) ;
62
+ var partHash = MD5 . HashData ( partBuffer . AsSpan ( 0 , bytesRead ) ) ;
63
+ partHashes . Add ( partHash ) ;
64
+ }
65
+
66
+ // Concatenate all part hashes
67
+ var concatenatedHashes = partHashes . SelectMany ( h => h ) . ToArray ( ) ;
68
+ var finalHash = MD5 . HashData ( concatenatedHashes ) ;
69
+
70
+ var multipartEtag = $ "{ Convert . ToHexStringLower ( finalHash ) } -{ parts } ";
71
+ EtagCache [ filePath ] = multipartEtag ;
72
+ return multipartEtag ;
73
+ }
74
+ }
75
+
76
+ public class AwsS3SyncPlanStrategy (
77
+ ILoggerFactory logFactory ,
78
+ IAmazonS3 s3Client ,
79
+ string bucketName ,
80
+ AssembleContext context ,
81
+ IS3EtagCalculator ? calculator = null
82
+ )
83
+ : IDocsSyncPlanStrategy
84
+ {
85
+ private readonly ILogger < AwsS3SyncPlanStrategy > _logger = logFactory . CreateLogger < AwsS3SyncPlanStrategy > ( ) ;
86
+
87
+ private readonly IS3EtagCalculator _s3EtagCalculator = calculator ?? new S3EtagCalculator ( logFactory , context . ReadFileSystem ) ;
88
+
20
89
private bool IsSymlink ( string path )
21
90
{
22
91
var fileInfo = context . ReadFileSystem . FileInfo . New ( path ) ;
@@ -42,7 +111,7 @@ await Parallel.ForEachAsync(localObjects, ctx, async (localFile, token) =>
42
111
if ( remoteObjects . TryGetValue ( destinationPath , out var remoteObject ) )
43
112
{
44
113
// Check if the ETag differs for updates
45
- var localETag = await CalculateS3ETag ( localFile . FullName , token ) ;
114
+ var localETag = await _s3EtagCalculator . CalculateS3ETag ( localFile . FullName , token ) ;
46
115
var remoteETag = remoteObject . ETag . Trim ( '"' ) ; // Remove quotes from remote ETag
47
116
if ( localETag == remoteETag )
48
117
{
@@ -89,14 +158,44 @@ await Parallel.ForEachAsync(localObjects, ctx, async (localFile, token) =>
89
158
90
159
return new SyncPlan
91
160
{
161
+ TotalSourceFiles = localObjects . Length ,
92
162
DeleteRequests = deleteRequests . ToList ( ) ,
93
163
AddRequests = addRequests . ToList ( ) ,
94
164
UpdateRequests = updateRequests . ToList ( ) ,
95
165
SkipRequests = skipRequests . ToList ( ) ,
96
- Count = deleteRequests . Count + addRequests . Count + updateRequests . Count + skipRequests . Count
166
+ TotalSyncRequests = deleteRequests . Count + addRequests . Count + updateRequests . Count + skipRequests . Count
97
167
} ;
98
168
}
99
169
170
+ /// <inheritdoc />
171
+ public PlanValidationResult Validate ( SyncPlan plan , float deleteThreshold )
172
+ {
173
+ if ( plan . TotalSourceFiles == 0 )
174
+ {
175
+ _logger . LogError ( "No files to sync" ) ;
176
+ return new ( false , 1.0f , deleteThreshold ) ;
177
+ }
178
+
179
+ var deleteRatio = ( float ) plan . DeleteRequests . Count / plan . TotalSyncRequests ;
180
+ // if the total sync requests are less than 100, we enforce a higher ratio of 0.8
181
+ // this allows newer assembled documentation to be in a higher state of flux
182
+ if ( plan . TotalSyncRequests <= 100 )
183
+ deleteThreshold = Math . Max ( deleteThreshold , 0.8f ) ;
184
+
185
+ // if the total sync requests are less than 1000, we enforce a higher ratio of 0.5
186
+ // this allows newer assembled documentation to be in a higher state of flux
187
+ else if ( plan . TotalSyncRequests <= 1000 )
188
+ deleteThreshold = Math . Max ( deleteThreshold , 0.5f ) ;
189
+
190
+ if ( deleteRatio > deleteThreshold )
191
+ {
192
+ _logger . LogError ( "Delete ratio is {Ratio} which is greater than the threshold of {Threshold}" , deleteRatio , deleteThreshold ) ;
193
+ return new ( false , deleteRatio , deleteThreshold ) ;
194
+ }
195
+
196
+ return new ( true , deleteRatio , deleteThreshold ) ;
197
+ }
198
+
100
199
private async Task < Dictionary < string , S3Object > > ListObjects ( Cancel ctx = default )
101
200
{
102
201
var listBucketRequest = new ListObjectsV2Request
@@ -117,51 +216,4 @@ private async Task<Dictionary<string, S3Object>> ListObjects(Cancel ctx = defaul
117
216
118
217
return objects . ToDictionary ( o => o . Key ) ;
119
218
}
120
-
121
- [ SuppressMessage ( "Security" , "CA5351:Do Not Use Broken Cryptographic Algorithms" ) ]
122
- private async Task < string > CalculateS3ETag ( string filePath , Cancel ctx = default )
123
- {
124
- if ( EtagCache . TryGetValue ( filePath , out var cachedEtag ) )
125
- {
126
- _logger . LogDebug ( "Using cached ETag for {Path}" , filePath ) ;
127
- return cachedEtag ;
128
- }
129
-
130
- var fileInfo = context . ReadFileSystem . FileInfo . New ( filePath ) ;
131
- var fileSize = fileInfo . Length ;
132
-
133
- // For files under 5MB, use simple MD5 (matching TransferUtility behavior)
134
- if ( fileSize <= PartSize )
135
- {
136
- await using var stream = context . ReadFileSystem . FileStream . New ( filePath , FileMode . Open , FileAccess . Read , FileShare . Read ) ;
137
- var smallBuffer = new byte [ fileSize ] ;
138
- var bytesRead = await stream . ReadAsync ( smallBuffer . AsMemory ( 0 , ( int ) fileSize ) , ctx ) ;
139
- var hash = MD5 . HashData ( smallBuffer . AsSpan ( 0 , bytesRead ) ) ;
140
- var etag = Convert . ToHexStringLower ( hash ) ;
141
- EtagCache [ filePath ] = etag ;
142
- return etag ;
143
- }
144
-
145
- // For files over 5MB, use multipart format with 5MB parts (matching TransferUtility)
146
- var parts = ( int ) Math . Ceiling ( ( double ) fileSize / PartSize ) ;
147
-
148
- await using var fileStream = context . ReadFileSystem . FileStream . New ( filePath , FileMode . Open , FileAccess . Read , FileShare . Read ) ;
149
- var partBuffer = new byte [ PartSize ] ;
150
- var partHashes = new List < byte [ ] > ( ) ;
151
-
152
- for ( var i = 0 ; i < parts ; i ++ )
153
- {
154
- var bytesRead = await fileStream . ReadAsync ( partBuffer . AsMemory ( 0 , partBuffer . Length ) , ctx ) ;
155
- var partHash = MD5 . HashData ( partBuffer . AsSpan ( 0 , bytesRead ) ) ;
156
- partHashes . Add ( partHash ) ;
157
- }
158
-
159
- // Concatenate all part hashes
160
- var concatenatedHashes = partHashes . SelectMany ( h => h ) . ToArray ( ) ;
161
- var finalHash = MD5 . HashData ( concatenatedHashes ) ;
162
-
163
- var multipartEtag = $ "{ Convert . ToHexStringLower ( finalHash ) } -{ parts } ";
164
- EtagCache [ filePath ] = multipartEtag ;
165
- return multipartEtag ;
166
- }
167
219
}
0 commit comments