import gzip
import urllib.request
import stat
-#from sortedcontainers import SortedDict
-#from sortedcontainers import SortedList
from collections import deque
from collections import OrderedDict
from datetime import datetime
import re
from heapq import heappush, heappop
import fcntl
+import logging
parser = argparse.ArgumentParser(description="mirror raspbian repo.")
parser.add_argument("baseurl", help="base url for source repo (e.g. https://archive.raspbian.org/ )",nargs='?')
parser.add_argument("mdurl", help="base url for mirrordirector or local source mirror (e.g. https://mirrordirector.raspbian.org/ )",nargs='?')
-parser.add_argument("hpurl", help="base url for last result hash pool (e.g. http://snapshot.raspbian.org/hashpool )",nargs='?')
+parser.add_argument("hpurl", help="base url for last result hash pool (e.g. https://snapshot.raspbian.org/hashpool )",nargs='?')
parser.add_argument("--internal", help=argparse.SUPPRESS) #base URL for private repo (internal use only)
parser.add_argument("--sourcepool", help="specify a source pool to look for packages in before downloading them (useful if maintaining multiple mirrors)",action='append')
lockfd = os.open('.',os.O_RDONLY)
fcntl.flock(lockfd,fcntl.LOCK_EX | fcntl.LOCK_NB)
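+# log beside the script itself; %(levelname)s is already in the format,
+# so messages do not need a manual 'warning:'/'error:' prefix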
+logpath = os.path.dirname(os.path.realpath(__file__))
+logging.basicConfig(filename=os.path.join(logpath, 'raspbmirror.log'), format='%(asctime)s %(levelname)s: %(message)s', level=logging.DEBUG)
+
def addfilefromdebarchive(filestoverify,filequeue,filename,sha256,size):
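+ # record filename -> [sha256, size, status] in filestoverify; files start
+ # with status 'M' and are flipped to 'F' later if their download fails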
size = int(size)
sha256andsize = [sha256,size,'M']
if filename in filestoverify:
if (sha256andsize[0:2] != filestoverify[filename][0:2]):
if stage == 'scanexisting':
- print('warning: same file with different hash/size during scanexisting phase old:'+repr(filestoverify[filename])+' new:'+repr(sha256andsize))
+ logging.warning('same file with different hash/size during scanexisting phase old:'+repr(filestoverify[filename])+' new:'+repr(sha256andsize))
#find existing sha1/size of file on disk if it exists
if os.path.isfile(filename):
f = open(filename,'rb')
size = None
filestoverify[filename] = [sha256,size,'M']
else:
- print('error: same file with different hash/size during downloadnew phase old:'+repr(filestoverify[filename])+' new:'+repr(sha256andsize))
+ logging.error('same file with different hash/size during downloadnew phase old:'+repr(filestoverify[filename])+' new:'+repr(sha256andsize))
sys.exit(1)
else:
filestoverify[filename] = sha256andsize
def ensuresafepath(path):
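+ # reject absolute paths, components with unexpected characters, and
+ # dot-prefixed components, guarding against path traversal via '..'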
pathsplit = path.split(b'/')
- if path[0] == '/':
+ if path.startswith(b'/'):
- print("path must be relative")
+ logging.info("path must be relative")
sys.exit(1)
for component in pathsplit:
if not pfnallowed.fullmatch(component):
- print("file name contains unexpected characters")
+ logging.info("file name contains unexpected characters")
sys.exit(1)
- elif component[0] == '.':
+ elif component.startswith(b'.'):
- print("filenames starting with a dot are not allowed")
+ logging.info("filenames starting with a dot are not allowed")
sys.exit(1)
-
+
def geturl(fileurl):
with urllib.request.urlopen(fileurl.decode('ascii')) as response:
data = response.read()
def getfile(path,sha256,size):
ensuresafepath(path)
if not shaallowed.fullmatch(sha256):
- print('invalid character in sha256 hash')
+ logging.error('invalid character in sha256 hash')
sys.exit(1)
- #hashfn = b'../hashpool/' + sha256[:2] +b'/'+ sha256[:4] +b'/'+ sha256
- #if os.path.isfile(hashfn):
- # if os.path.getsize(hashfn) != size:
- # print('size mismatch on existing file in hash pool')
- # sys.exit(1)
- #else:
- # secondhashfn = None
- # if args.secondpool is not None:
- # secondhashfn = os.path.join(args.secondpool.encode('ascii'),sha256[:2] +b'/'+ sha256[:4] +b'/'+ sha256)
- # #print(secondhashfn)
- # if not os.path.isfile(secondhashfn):
- # secondhashfn = None
- # if secondhashfn is None:
- # else:
- # print('copying '+path.decode('ascii')+' with hash '+sha256.decode('ascii')+' from secondary pool')
- # f = open(secondhashfn,'rb')
- # data = f.read()
- # f.close()
- # ts = os.path.getmtime(secondhashfn)
- # sha256hash = hashlib.sha256(data)
- # sha256hashed = sha256hash.hexdigest().encode('ascii')
- # if (sha256 != sha256hashed):
- # #print(repr(filesize))
- # #print(repr(sha256))
- # #print(repr(sha256hashed))
- # print('hash mismatch while downloading file '+path.decode('ascii')+' '+sha256.decode('ascii')+' '+sha256hashed.decode('ascii'));
- # sys.exit(1)
- # if len(data) != size:
- # print('size mismatch while downloading file')
- # sys.exit(1)
- # hashdir = os.path.dirname(hashfn)
- # os.makedirs(hashdir,exist_ok=True)
- # f = open(hashfn,'wb')
- # f.write(data)
- # f.close()
- #
- # os.utime(hashfn,(ts,ts))
if len(os.path.dirname(path)) > 0:
os.makedirs(os.path.dirname(path),exist_ok=True)
if os.path.isfile(makenewpath(path)): # "new" file already exists, lets check the hash
fn = makenewpath(path)
sha256hashed, tl = getfilesha256andsize(fn)
if (sha256 == sha256hashed) and (size == tl):
- print('existing file '+path.decode('ascii')+' matched by hash and size')
+ logging.info('existing file '+path.decode('ascii')+' matched by hash and size')
fileupdates.add(path)
return # no download needed but rename is
- elif path in oldknownfiles:
+ elif path in oldknownfiles:
#shortcut exit if file is unchanged, we skip this if a "new" file was detected because
#that means some sort of update was going on to the file and may need to be finished/cleaned up.
oldsha256,oldsize,oldstatus = oldknownfiles[path]
if (size == os.path.getsize(path)): #no point reading the data and calculating a hash if the size does not match
sha256hashed, tl = getfilesha256andsize(path)
if (sha256 == sha256hashed) and (size == tl):
- print('existing file '+path.decode('ascii')+' matched by hash and size')
+ logging.info('existing file '+path.decode('ascii')+' matched by hash and size')
if os.path.isfile(makenewpath(path)):
#if file is up to date but a "new" file exists and is bad
#(we wouldn't have got this far if it was good)
outputpath = path
pathsplit = path.split(b'/')
if (pathsplit[1:2] == [b'pool']) and (args.debugskippool):
- print('skipping download of '+path.decode('ascii')+' because --debugskippool was specified')
+ logging.info('skipping download of '+path.decode('ascii')+' because --debugskippool was specified')
return
if (args.internal is not None) and (pathsplit[0] == b'raspbian'):
fileurl = args.internal.encode('ascii') +b'/private/' + b'/'.join(pathsplit[1:])
data = None
if args.sourcepool is not None:
for sourcepool in args.sourcepool:
- #print(repr(args.sourcepool))
- #print(repr(sourcepool))
sourcepool = sourcepool.encode('ascii')
if pathsplit[1] == b'pool':
spp = os.path.join(sourcepool,b'/'.join(pathsplit[2:]))
if os.path.isfile(spp) and (size == os.path.getsize(spp)):
- print('trying file from sourcepool '+spp.decode('ascii'))
+ logging.info('trying file from sourcepool '+spp.decode('ascii'))
ts = os.path.getmtime(spp)
f = open(spp,'rb')
data = f.read()
sha256hash = hashlib.sha256(data)
sha256hashed = sha256hash.hexdigest().encode('ascii')
if (sha256 != sha256hashed):
- #print(repr(filesize))
- #print(repr(sha256))
- #print(repr(sha256hashed))
- print('hash mismatch while trying file from sourcepool, ignoring file');
+ logging.warning('hash mismatch while trying file from sourcepool, ignoring file')
data = None
continue
try:
os.link(spp,outputpath)
- print('successfully hardlinked file to source pool')
-
+ logging.info('successfully hardlinked file to source pool')
+
- except:
+ except OSError:
- print('file in souce pool was good but hard linking failed, copying file instead')
+ logging.warning('file in source pool was good but hard linking failed, copying file instead')
fdownloads.write(outputpath+b'\n')
fdownloads.flush()
return
gzfile = makenewpath(path+b'.gz')
else:
gzfile = path+b'.gz'
- print('uncompressing '+gzfile.decode('ascii')+' with hash '+sha256.decode('ascii')+' to '+outputpath.decode('ascii'))
+ logging.info('uncompressing '+gzfile.decode('ascii')+' with hash '+sha256.decode('ascii')+' to '+outputpath.decode('ascii'))
f = gzip.open(gzfile)
data = f.read()
f.close()
if not checkdatahash(data, sha256, 'hash mismatch while uncompressing file ', path, ''):
sys.exit(1)
if len(data) != size:
- print('size mismatch while uncompressing file')
+ logging.error('size mismatch while uncompressing file')
sys.exit(1)
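+ # pool files are fetched from the mirrordirector first, falling back to
+ # the main server if that download fails or fails verification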
#use slicing so we don't error if pathsplit only has one item
if (data is None) and (mdurl is not None) and (pathsplit[1:2] == [b'pool']):
fileurl = mdurl + b'/' + path
- #fileurl = mdurl + b'/' + b'/'.join(pathsplit[1:])
data, ts = getandcheckfile(fileurl, sha256, size, path, outputpath, ' from mirrordirector',' trying main server instead')
if data is None:
data, ts = getandcheckfile(fileurl, sha256, size, path, outputpath, '','')
if data is None:
if (stage == 'downloadnew') and (b'dists' not in pathsplit):
- print('continuing dispite download failure of '+path.decode('ascii')+', may revisit later')
+ logging.warning('continuing despite download failure of '+path.decode('ascii')+', may revisit later')
global dlerrorcount
dlerrorcount += 1
knownfiles[path][2] = 'F'
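+ # as a last resort, try the snapshot hash pool, which stores content
+ # addressed as <first 2 hex chars>/<first 4 hex chars>/<full sha256>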
fileurl = hpurl + b'/' + sha256[0:2] + b'/' + sha256[0:4] + b'/' + sha256
data, ts = getandcheckfile(fileurl, sha256, size, path, outputpath, '', '')
if data is None:
- print('failed to get '+path.decode('ascii')+' aborting')
+ logging.error('failed to get '+path.decode('ascii')+', aborting')
sys.exit(1)
if data is not ...: #... is used to indicate that the file has been downloaded directly to disk and we don't
# need to write it out here.
else:
writepath = outputpath
viamsg = ''
- print(
+ logging.info(
'downloading ' + fileurl.decode('ascii') + ' with hash ' + sha256.decode(
'ascii') + ' to ' + outputpath.decode(
'ascii') + viamsg)
errorsuffix):
data = None
elif tl != size:
- print('size mismatch while downloading file' + errorfromstr + '.' + errorsuffix)
+ logging.warning('size mismatch while downloading file' + errorfromstr + '.' + errorsuffix)
data = None
except Exception as e:
- print('exception ' + str(e) + ' while downloading file' + errorfromstr + '.' + errorsuffix)
+ logging.warning('exception ' + str(e) + ' while downloading file' + errorfromstr + '.' + errorsuffix)
if f is not None:
f.close()
data = None
def testandreporthash(sha256hash, sha256, errorprefix, path, errorsuffix):
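+ # compare the computed digest with the expected one; log a mismatch and
+ # return False so the caller can decide whether to retry or abort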
sha256hashed = sha256hash.hexdigest().encode('ascii')
if (sha256 != sha256hashed):
- # print(repr(filesize))
- # print(repr(sha256))
- # print(repr(sha256hashed))
- print(errorprefix + path.decode('ascii') + ' ' + sha256.decode('ascii') + ' ' + sha256hashed.decode(
- 'ascii') + errorsuffix);
+ logging.warning(errorprefix + path.decode('ascii') + ' ' + sha256.decode('ascii') + ' ' + sha256hashed.decode(
+ 'ascii') + errorsuffix)
return False
return True
if args.baseurl is None:
baseurl = b'https://archive.raspbian.org'
mdurl = b'http://mirrordirector.raspbian.org'
- hpurl = b'http://snapshot.raspbian.org/hashpool'
+ hpurl = b'https://snapshot.raspbian.org/hashpool'
else:
baseurl = args.baseurl.encode('ascii')
-
-
-
symlinkupdates = list()
fileupdates = set()
def opengu(filepath):
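+ # open the best available copy of filepath: a staged replacement (as
+ # produced by makenewpath) is preferred, then the file itself, then a
+ # gzipped substitute; returns None if nothing usable exists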
- #print('in opengu')
- #print('filepath = '+repr(filepath))
- #print('fileupdates = '+repr(fileupdates))
f = None
if (filepath in fileupdates):
- print((b'opening '+makenewpath(filepath)+b' for '+filepath).decode('ascii'))
+ logging.info((b'opening '+makenewpath(filepath)+b' for '+filepath).decode('ascii'))
f = open(makenewpath(filepath),'rb')
elif (filepath+b'.gz' in fileupdates):
- print((b'opening '+makenewpath(filepath+b'.gz')+b' for '+filepath).decode('ascii'))
+ logging.info((b'opening '+makenewpath(filepath+b'.gz')+b' for '+filepath).decode('ascii'))
f = gzip.open(makenewpath(filepath+b'.gz'),'rb')
elif os.path.exists(filepath):
- print((b'opening '+filepath+b' for '+filepath).decode('ascii'))
+ logging.info((b'opening '+filepath+b' for '+filepath).decode('ascii'))
f = open(filepath,'rb')
elif os.path.exists(filepath+b'.gz'):
- print((b'opening '+filepath+b'.gz for '+filepath).decode('ascii'))
+ logging.info((b'opening '+filepath+b'.gz for '+filepath).decode('ascii'))
f = gzip.open(filepath+b'.gz','rb')
return f
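+# three passes: 'scanexisting' records what is already on disk,
+# 'downloadnew' fetches updated files, and 'finalize' retries anything
+# that failed during the main download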
for stage in ("scanexisting","downloadnew","finalize"):
if stage == "finalize":
if dlerrorcount == 0:
- print('skipping stage 3 as there were no download failures in stage 2')
+ logging.info('skipping stage 3 as there were no download failures in stage 2')
#we can finish now.
break
- print('stage 3, download final updates')
-
+ logging.info('stage 3, download final updates')
+
oldknownfiles = knownfiles
oldsymlinks |= newsymlinks
newsymlinks = set()
if stage == "downloadnew":
- print('stage 2, main download')
+ logging.info('stage 2, main download')
oldknownfiles = knownfiles
basefiles = set(oldknownfiles.keys())
if stage == "scanexisting":
- print('stage 1, scan existing')
+ logging.info('stage 1, scan existing')
else:
if args.internal is not None:
fileurl = args.internal.encode('ascii') + b'/snapshotindex.txt'
if (stage == "downloadnew") and (args.debugfif is not None):
fileurl = args.debugfif.encode('ascii')
- (filedata,ts) = geturl(fileurl)
+ (filedata,ts) = geturl(fileurl)
f = open(makenewpath(b'snapshotindex.txt'),'wb')
if (args.tlwhitelist is None) and (args.distswhitelist is None):
for line in lines:
path, sizeandsha = line.split(b' ')
pathsplit = path.split(b'/')
- #print(pathsplit)
- #print(len(pathsplit))
if (len(pathsplit) > 2) and (pathsplit[1] == b'dists'):
if sizeandsha[0:2] == b'->': #symlink
target = sizeandsha[2:]
pass
else:
linesnew.append(line)
-
+
lines = linesnew
if founddists == set():
- print('none of the whitelisted distributions were found in the index file')
+ logging.error('none of the whitelisted distributions were found in the index file')
sys.exit(1)
missingesdists = founddists - foundesdists
if missingesdists != set():
for toplevel,distribution in missingesdists:
- print((b'missing extra sources file for '+toplevel+b'/dists/'+distribution).decode('ascii'))
+ logging.error((b'missing extra sources file for '+toplevel+b'/dists/'+distribution).decode('ascii'))
sys.exit(1)
for line in lines:
f.write(line+b'\n')
if os.readlink(filepath) != symlinktarget:
symlinkupdates.append((filepath,symlinktarget))
else:
- print('creating symlink '+filepath.decode('ascii')+' -> '+symlinktarget.decode('ascii'))
+ logging.info('creating symlink '+filepath.decode('ascii')+' -> '+symlinktarget.decode('ascii'))
os.symlink(symlinktarget,filepath)
newsymlinks.add(filepath)
else:
extrasources = {}
while filequeue:
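+ # filequeue is a binary heap of (priority, filepath) pairs, so entries
+ # with the lowest priority value are processed first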
(priority, filepath) = heappop(filequeue)
- #print('processing '+filepath.decode('ascii'))
sha256,size,status = knownfiles[filepath]
if (stage != "scanexisting") and ((filepath+b'.gz' not in knownfiles) or (status == 'R') or os.path.exists(filepath)):
getfile(filepath,sha256,size)
pathsplit = filepath.split(b'/')
- #print(pathsplit[-1])
- #if (pathsplit[-1] == b'Packages'):
- # print(repr(pathsplit))
if (pathsplit[-1] == b'Release') and (pathsplit[-3] == b'dists'):
distdir = b'/'.join(pathsplit[:-1])
f = opengu(filepath)
if f is None:
if stage == 'scanexisting':
- print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
+ logging.warning('cannot find '+filepath.decode('ascii')+' while scanning existing state')
continue
else:
- print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
+ logging.error('cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
sys.exit(1)
- insha256 = False;
+ insha256 = False
for line in f:
toplevel = b'/'.join(pathsplit[:-5])
else:
toplevel = b'/'.join(pathsplit[:-6])
- print('found packages file: '+filepath.decode('ascii'))
+ logging.info('found packages file: '+filepath.decode('ascii'))
pf = opengu(filepath)
if pf is None:
if stage == 'scanexisting':
- print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
+ logging.warning('cannot find '+filepath.decode('ascii')+' while scanning existing state')
continue
else:
- print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
+ logging.error('cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
sys.exit(1)
filename = None
size = None
sha256 = None
-
+
for line in pf:
linesplit = line.split()
if (len(linesplit) == 0):
sha256 = linesplit[1]
pf.close()
elif (pathsplit[-1] == b'Sources') and (pathsplit[-5] == b'dists'):
- print('found sources file: '+filepath.decode('ascii'))
+ logging.info('found sources file: '+filepath.decode('ascii'))
toplevel = b'/'.join(pathsplit[:-5])
pf = opengu(filepath)
if pf is None:
if stage == 'scanexisting':
- print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
+ logging.warning('cannot find '+filepath.decode('ascii')+' while scanning existing state')
continue
else:
- print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
+ logging.error('cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
sys.exit(1)
- filesfound = [];
+ filesfound = []
directory = None
insha256p = False
pf.close()
elif (args.distswhitelist is not None) and (pathsplit[-1] == b'extrasources') and (pathsplit[-3] == b'dists'):
- print('found extrasources file: '+filepath.decode('ascii'))
+ logging.info('found extrasources file: '+filepath.decode('ascii'))
esf = opengu(filepath)
if esf is None:
if stage == 'scanexisting':
- print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
+ logging.warning('cannot find '+filepath.decode('ascii')+' while scanning existing state')
continue
else:
- print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
+ logging.error('cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
sys.exit(1)
for line in esf:
line = line.strip()
size , sha256 = shaandsize.split(b':')
addfilefromdebarchive(knownfiles,filequeue,filename,sha256,size)
extrasources[filename] = shaandsize
- #print(line)
fdownloads.close()
fdownloads = open(makenewpath(b'raspbmirrordownloads.txt'),"rb")
for (dirpath, dirnames, filenames) in towalk:
for filename in (filenames + dirnames): # os.walk seems to regard symlinks to directories as directories.
filepath = os.path.join(dirpath, filename)[2:].encode('ascii') # [2:] is to strip the ./ prefix
- # print(filepath)
if os.path.islink(filepath):
oldsymlinks.add(filepath)
for filename in filenames:
if not os.path.islink(filepath) and not filepath.startswith(b'snapshotindex.txt') and not filepath.startswith(b'raspbmirrordownloads.txt'):
basefiles.add(filepath)
-print('stage 4, moves and deletions')
+logging.info('stage 4, moves and deletions')
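+# staged files are swapped into place with os.replace, which is atomic
+# when source and destination are on the same filesystem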
for filepath in fileupdates:
- print((b'renaming '+makenewpath(filepath)+b' to '+filepath).decode('ascii'))
+ logging.info((b'renaming '+makenewpath(filepath)+b' to '+filepath).decode('ascii'))
os.replace(makenewpath(filepath),filepath)
for (filepath,symlinktarget) in symlinkupdates:
- print('updating symlink '+filepath.decode('ascii')+' -> '+symlinktarget.decode('ascii'))
+ logging.info('updating symlink '+filepath.decode('ascii')+' -> '+symlinktarget.decode('ascii'))
os.remove(filepath)
os.symlink(symlinktarget,filepath)
#file may not actually exist, either due to earlier updates gone-wrong
#or due to the file being a non-realised uncompressed version of
#a gzipped file.
- if os.path.exists(filepath):
+ if os.path.exists(filepath):
ensuresafepath(filepath)
- print('removing '+filepath.decode('ascii'))
+ logging.info('removing '+filepath.decode('ascii'))
os.remove(filepath)
#clean up empty directories.
dirpath = os.path.dirname(filepath)
while (len(dirpath) != 0) and isemptydir(dirpath):
- print('removing empty dir '+dirpath.decode('ascii'))
+ logging.info('removing empty dir '+dirpath.decode('ascii'))
os.rmdir(dirpath)
dirpath = os.path.dirname(dirpath)
os.rename(makenewpath(b'snapshotindex.txt'),b'snapshotindex.txt')
os.remove(makenewpath(b'raspbmirrordownloads.txt'))
-