## datasitewalk.py ## author: Matthew Marquissee, NASA GSFC, Code 632 ## date started: June 26, 2003 ## last revision: July 29, 2003 ## purpose: This file provides a routine that will return ## a list of all files gathered under a specific remote ## directory (ftp) for further processing. Although optimized for ## the dataset finder, this module offers a generic structure for ## walking a remote site. Specify hierarchy with a string: ## i.e. '*_none' goes 1 level down, 'none' does not recurse (level 0), ## 'year_day_none' goes two levels down (year and day of year) import ftputil import ftplib import re, random import os.path from fnmatch import fnmatch from urlparse import urlparse import datetime # generic walk function def datasitewalk(full_path, divby='none', time1=datetime.datetime.min, time2=datetime.datetime.max): parts = urlparse(full_path) protocol, siteadd, basedir = parts[:3] if protocol == 'ftp': return ftpwalk(full_path, divby, time1, time2) ## elif protocol == 'http': ## return httpwalk(full_path, divby, time1, time2) else: return ['Only FTP is currently supported.'] # FTP layer skippats = ['.', '..'] def ftpwalk(full_path, divby='none', time1=datetime.datetime.min, time2=datetime.datetime.max): # parse the full_path parts = urlparse(full_path) protocol, siteadd, basedir = parts[:3] # hierarchy string into array divisions = re.split('_', divby) # anonymous login ftp = ftplib.FTP(siteadd) ftp.login() try: stats = ftp.voidcmd('STAT') except ftplib.error_perm, msg: stats = '' if stats.find('Microsoft') != -1: servertype = 'microsoft' elif stats.find('Mac') != -1: servertype = 'mac' elif stats.find('MultiNet FTP Server') != -1: servertype = 'vms' else: servertype = 'unix' try: ftp.cwd(basedir) except ftplib.error_perm, msg: return ['Data directory does not exist on remote server!'] result = walkftp(ftp, servertype, divisions, time1, time2) # now, prepend the rest of the URL to each element return map(lambda f_name: protocol + '://' + siteadd + f_name, result) def walkftp(f, servertype, divby, time1, time2): # initialize some structures results = [] pwd = f.pwd() # get directory listing of current directory subdirs, filesfound = getdirlisting(f, servertype) # if at bottom directory: add file name to results if divby[0] == 'none': results = map(lambda x: pwd + '/' + x, filesfound) # recurse the subdirectories for subdir in subdirs: # change directory try: f.cwd(pwd + '/' + subdir) except ftplib.error_perm, msg: print "Can't chdir to", `subdir`, ":", `msg` else: # add any time-based shortcuts for dataset finder here if divby[0] == 'year': if (time1.year - 1 <= int(subdir) <= time2.year): results = results + walkftp(f, servertype, divby[1:], time1, time2) else: pass elif divby[0] == 'none': pass else: results = results + walkftp(f, servertype, divby[1:], time1, time2) # we're done, go back return results def getdirlisting(ftp, type = 'unix'): """Parse an FTP LIST command into subdirectories and files.""" subdirs = [] files = [] info = dict() listing = [] ftp.retrlines('LIST', listing.append) for line in listing: if type == 'mac': # Mac listing has just filenames; # trailing / means subdirectory filename = line.strip() mode = '-' if filename[-1:] == '/': filename = filename[:-1] mode = 'd' infostuff = '' elif type == 'microsoft': mode = '-' words = line.split(None, 3) filename = words[-1].lstrip() if words[-2] == '