diff --git a/.gitignore b/.gitignore index 2b0d552..36fd670 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,5 @@ *.csv *.log *.json -*.new ignore/ +__pycache__/ diff --git a/README.md b/README.md index 64e81a7..90e95b0 100644 --- a/README.md +++ b/README.md @@ -20,46 +20,50 @@ sub-folders in preparation for DRL to perform their scanning processes. Spreadsheet Columns: - ----------------------------------------------------------------------- - **Required - Columns** - ------------------- --------------------------------------------------- - 'id' This is the PID of the object. Must currently be - the first column of the sheet. + |Required Columns || + |------------------- |---------------------------------------------------| + |'id' |This is the PID of the object. Must currently be the first column of the sheet.| - **Optional - Columns** + |Optional Columns || + |------------------- |---------------------------------------------------| + |Any |Any additional columns as needed.| - Any Any additional columns as needed. - ----------------------------------------------------------------------- +## Script Parameters: -Script Parameters: + |Parameter |Description| + |:--- |:--- | + |\--config-file |Path to the script config file containing the path to the scanning directory (scanning_path).| + |\--xls-file |Path to the spreadsheet to be processed.| + |\--batch-name |The Name of the batch that will be created.| + |\--log-file |Path to the log file.| + |\--use-google |Set this to true if using Google Sheets.| + |\--google-sheet-id |The Google Sheet Identifier.| + |\--google-sheet-name|The Google Sheet Tab Name. 
This defaults to 'Sheet1' if not included.| + |\--google-creds-file|Path to the Google credentials file.| - ----------------------------------------------------------------------- - Parameter Description - ------------------- --------------------------------------------------- - \--config_file Path to the script config file containing paths to - the workbench directory (workbench_path), the - scanning directory (scanning_path), and the path to - the python executable. +## Config File requirements: - \--xls-file Path to the spreadsheet to be processed. +The config file contains a single option "scanning_path" that points the script to where you would like to build the directory structure for the batch-name you are passing in to the script. This is usually the top-level folder where you store all your batches. In our example, we would set "scanning_path" to "/scanning" as in the following. + +``` +scanning_path: /scanning +``` + +An example config file can be found in the make-batch-dirs.conf_sample file. - \--batch-name The Name of the batch that will be created. - ----------------------------------------------------------------------- ## Usage: -Script Usage Example: +Script Usage Examples: + |Type|Example| + |:--- |:--- | + |Spreadsheet|make-batch-dirs --config-file config.conf --xls-file input_spreadsheet.xls --log-file log.txt --batch-name MyNewBatch| + |Google Sheet|make-batch-dirs --config-file config.conf --log-file log.txt --batch-name MyNewBatch --use-google {true\\|false} --google-sheet-id {sheet id} --google-sheet-name {E.g. 'Sheet1'} --google-creds-file {path to credentials file.}| -make-batch-dirs ---config_file config.conf ---xls-file -input_spreadsheet.xls ---batch-name MyNewBatch ## Function: -For each row in the spreadsheet, obtain the first column contents which -should be the 'id' column and construct a new directory structure in the -format of {scanning_path}/{batch-name}/{id}. 
+For each row in the spreadsheet, obtain the 'id' column contents and construct a new directory structure in the format of {scanning_path}/{batch-name}/{id}. E.g. Result: /scanning/MyNewBatch/317350000001, /scanning/MyNewBatch/317350000002, /scanning/MyNewBatch/317350000003 diff --git a/make-batch-dirs b/make-batch-dirs index 32bfc6a..ca2c1ad 100755 --- a/make-batch-dirs +++ b/make-batch-dirs @@ -7,85 +7,204 @@ import json import sys import subprocess import shutil +import sheetutils import logging import openpyxl import csv +import pandas as pd# for type hints -def get_username(): +# Setup the log file format. +log_formatter = logging.Formatter(fmt='%(asctime)s.%(msecs)03d %(levelname)s %(message)s',datefmt="%Y%m%d %H:%M:%S") + +def get_username() -> str: + """fetch username of user running script""" return pwd.getpwuid(os.getuid())[0] -def setup_logger(name, log_file, level=logging.DEBUG): +def setup_logger(name:str, log_file:str, level=logging.DEBUG): """To setup as many loggers as needed""" - handler = logging.FileHandler(log_file) - handler.setFormatter(log_formatter) + file_handler = logging.FileHandler(log_file) + file_handler.setFormatter(log_formatter) + console_handler = logging.StreamHandler(sys.stdout) logger = logging.getLogger(name) logger.setLevel(level) - logger.addHandler(handler) + logger.addHandler(file_handler) + logger.addHandler(console_handler) return logger -def read_yaml_file(path): +def read_yaml_file(path: str) -> dict: + """ + read yaml file. + Note: get_data coerces yaml to most appropriate type. 
+ Most of the time this is dict, but might be list or str possibly + """ with open(path, "r") as stream: + # FullLoader allows yaml to execute arbitrary python + # so script users are assumed to be trusted return yaml.FullLoader(stream).get_data() -parser = argparse.ArgumentParser(description='Run..') -parser.add_argument('--config-file', dest="config_file", required=True, help='Config file.') -parser.add_argument('--xls-file', dest="xls_file", required=True, help='XLS file.') -parser.add_argument('--batch-name', dest="batch_name", required=False, help='Name of the batch.') -args = parser.parse_args() - -# Set configuration variables from config-file parameter -username = get_username() -cfg = read_yaml_file(args.config_file) -workbench_path = cfg['workbench_path'] -scanning_path = cfg['scanning_path'] -python_exe = cfg['python_exe'] - -if ( args.batch_name ): - batch_name = args.batch_name -else: - batch_name = os.path.splitext(os.path.basename(args.xls_file))[0] - -batch_path = scanning_path+"/"+batch_name - -print(f"Creating Batch Path: {batch_path}") -if ( os.path.isdir(batch_path) ): - print(f"Error: {batch_path} exists!") - exit() -else: - os.mkdir(batch_path) - -print(f"Copying spreadsheet to {batch_path}/manifest.xlsx") -shutil.copyfile(args.xls_file, batch_path+"/manifest.xlsx") - -print(f"Creating spreadsheet as csv") -wb = openpyxl.load_workbook(args.xls_file) -sheetnames = wb.sheetnames -sheet_value_arr = [] -for a in sheetnames: - sheet = wb[a] - with open(batch_path+"/manifest.csv", "w") as f: - c = csv.writer(f) - for row in sheet.rows: - sheet_value_arr.append([cell.value for cell in row]) - #for r in sheet.rows: - c.writerow([cell.value for cell in row]) -f.close() - -print(f"Reading spreadsheet: {args.xls_file}") -workbook = openpyxl.load_workbook(args.xls_file) -dataframe = workbook.active - -rows = dataframe.iter_rows() -next(rows) -for row in rows: - if ((str(row[0].value)) and (str(row[0].value) != "None" )): - id = str(row[0].value) - 
print(f"Creating {batch_path}/{id}") - object_path = batch_path + "/" + id - try: - os.mkdir(object_path) - except OSError as error: - print(f"Warning: {batch_path}/{id} - {error}.") - -print(f"Batch Path Creation Complete.") +def str_to_bool(value: str) -> bool: + """ + take common 'yes' and 'no' nouns and converts them to boolean + + error: raises ArgumentTypeError when noun not found within expected nouns + """ + value = value.strip() + if value.lower() in {'true','t','yes','y','1'}: + return True + elif value.lower() in {'false','f','no','n','0'}: + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected') + +def create_batch_folder(scanning_path:str, batch_name:str): + """ + create folder {scanning_path}/{batch_name} in cwd + + error: if path exists, then logs error and exits + """ + batch_path = os.path.sep.join([scanning_path,batch_name]) + logger.info(f"Creating Batch Path: {batch_path}") + if ( os.path.isdir(batch_path) ): + logger.error(f"Error: {batch_path} exists") + exit(1) + else: + os.mkdir(batch_path) + return batch_path + +def copy_xslx_to_batch(batch_path:str): + """ + copy manifest.xslx into batch_path created folder + + error: if manifest.xslx does not exist, then log and fail + """ + manifest_path = os.path.sep.join([batch_path,'manifest.xlsx']) + if not os.path.isfile(manifest_path): + logger.info(f"Copying spreadsheet to {manifest_path}") + shutil.copyfile(args.xls_file, manifest_path) + else: + logger.error(f"Error: {manifest_path} already exists.") + exit(1) + +def save_xslx_as_csv(batch_path:str): + """ + save args.xls_file into batch_path/manifest.csv + """ + logger.info(f"Creating spreadsheet as csv") + wb = openpyxl.load_workbook(args.xls_file) + sheetnames = wb.sheetnames + sheet_value_arr = [] + manifest_csv = os.path.sep.join([batch_path,'manifest.csv']) + for a in sheetnames: + sheet = wb[a] + with open(manifest_csv, "w") as f: + c = csv.writer(f) + for row in sheet.rows: + sheet_value_arr.append([cell.value 
for cell in row]) + #for r in sheet.rows: + c.writerow([cell.value for cell in row]) + +def make_dirs_from_df(df:pd.DataFrame, batch_path): + """ + read id field of dataframe, and create batch_path/id + for non empty and non None ids. + + error: log if mkdir fails. log and exit if id not in + dataframe's columns or dataframe empty + """ + # Make sure the df has an 'id' column and data rows + if 'id' not in df.columns: + logger.error(f"Column 'id' does not exist") + exit(1) + # Make sure the df has rows besides the header row. + if len(df) == 0: + logger.error("Sheet contains no data") + exit(1) + rows = df.iterrows() + for _, row in rows: + if (str(row.loc['id']) and (str(row.loc['id']) != "None" )): + id = str(row.loc['id']) + logger.info(f"Creating {batch_path}/{id}") + object_path = os.path.sep.join([batch_path,id]) + try: + os.mkdir(object_path) + except OSError as error: + logger.warning(f"Warning: {object_path} - {error}.") + +# +# Main function. +# +def main(): + """ + This the main function. 
+ """ + # Parse command line arguements + parser = argparse.ArgumentParser(description='Run..') + parser.add_argument('--config-file', dest="config_file", required=True, help='Config file.') + parser.add_argument('--xls-file', dest="xls_file", required=False, help='XLS file.') + parser.add_argument('--batch-name', dest="batch_name", required=True, help='Name of the batch.') + parser.add_argument('--log-file', dest="log_file", required=False, help='Log file.') + parser.add_argument('--use-google', dest='use_google', type=str_to_bool, required=False, default=False, help='Use Google Sheet.') + parser.add_argument('--google-sheet-id', dest='google_sheet_id', required=False) + parser.add_argument('--google-sheet-name', dest='google_sheet_name', default="Sheet1", required=False) + parser.add_argument('--google-creds-file', dest='google_sheet_creds', required=False) + + # Make args a global variable + global args + args = parser.parse_args() + + # Set configuration variables from config-file parameter + username = get_username() + cfg = read_yaml_file(args.config_file) + scanning_path = cfg['scanning_path'] + + # Setup Log file. + global logger + if ( args.log_file ): + logger = setup_logger("log",args.log_file) + else: + logger = setup_logger("log","log.txt") + logger.info(f"Log file created.") + + # Setup the batch_name + if ( args.batch_name ): + batch_name = args.batch_name + else: + batch_name = os.path.splitext(os.path.basename(args.xls_file))[0] + logger.info(f"Batch name: {batch_name}") + + # If we are using google sheets... 
+ if args.use_google: + if not args.google_sheet_creds: + logger.error(f"Error: --google-sheet-creds is required.") + if not args.google_sheet_id: + logger.error(f"Error: --google-sheet-id is required.") + if not args.google_sheet_name: + logger.error(f"Error: --google-sheet-name is required.") + if args.google_sheet_creds and args.google_sheet_id and args.google_sheet_name and os.path.isfile(args.google_sheet_creds): + logger.info(f"Using Google Sheet: {args.google_sheet_id},{args.google_sheet_name}") + manager = sheetutils.GoogleSheetManager() + manager.connect(args.google_sheet_creds) + sheet = manager.sheet(args.google_sheet_id, args.google_sheet_name) + df = sheet.read() + batch_path = create_batch_folder(scanning_path, batch_name) + make_dirs_from_df(df,batch_path) + else: + logger.error(f"Error: Google arguments are required when using Google Sheets.") + exit() + # Else we are using a Spreadsheet... + else: + logger.info(f"Using Spreadsheet: {args.xls_file}") + logger.info(f"Creating Batch folder: {scanning_path}/{batch_name}") + df = pd.read_excel(args.xls_file) + batch_path = create_batch_folder(scanning_path, batch_name) + make_dirs_from_df(df,batch_path) + logger.info(f"Storing local copy of xlsx file and creating csv file from Google Sheet.") + copy_xslx_to_batch(batch_path) + save_xslx_as_csv(batch_path) + + logger.info(f"Batch Path Creation Complete.") + +# Main call. +if __name__ == "__main__": + main() diff --git a/make-batch-dirs.conf_sample b/make-batch-dirs.conf_sample index 01d6dbd..adff3b7 100644 --- a/make-batch-dirs.conf_sample +++ b/make-batch-dirs.conf_sample @@ -2,11 +2,6 @@ # make-batch-dirs config. # -# workbench_path - where is your workbench directory. -workbench_path: - # scanning_path - where is your scanning directory. scanning_path: -# python_exe - where is your python executable. 
-python_exe: /usr/bin/python3 diff --git a/sheetutils.py b/sheetutils.py new file mode 100644 index 0000000..e3a5df0 --- /dev/null +++ b/sheetutils.py @@ -0,0 +1,139 @@ +from googleapiclient.discovery import build, Resource +from google.oauth2 import service_account +import os +import pandas as pd +from typing import TypeAlias, Any +import logging + +logger = logging # use default logger + +class GoogleSheet: + """ + represents a single spreadsheet (i.e. a single tab in sheets) + and allows read and update operations + """ + def __init__(self, sheet_obj: Resource): + self.sheet = sheet_obj + + def read(self) -> pd.DataFrame: + """ + read spreadsheet from service resource object into dataframe + """ + sheet = self.sheet + data = sheet.get('values', []) + + if not data: + logger.warn(f"read_google_sheet - No data found in the specified worksheet.") + + # Return empty DataFrame + return pd.DataFrame() + + else: + logger.info(f"read_google_sheet - Read of Google Sheet Successful.") + + # Convert to DataFrame + # First row as headers, rest as data + headers = data[0] + rows = data[1:] if len(data) > 1 else [] + + # Pad rows with fewer columns with fill_value + fill_value = None + max_columns = len(headers) + padded_rows = [row + [fill_value] * (max_columns - len(row)) for row in rows] + + # Create DataFrame + df = pd.DataFrame(padded_rows, columns=headers) + + return df + + def update(self, df: pd.DataFrame) -> tuple[bool, str]: + """ + write contents of df into spreadsheet. 
Note that this + overwrites the spreadsheet contents + """ + # Convert DataFrame to list of lists (including headers) + values = [df.columns.tolist()] + df.values.tolist() + + # Prepare the body for the API request + body = { + 'values': values + } + + # Update the sheet with DataFrame contents + try: + result = sheet.values().update( + spreadsheetId=spreadsheet_id, + range=f'{sheet_name}!A1', + valueInputOption='RAW', + body=body + ).execute() + except Exception as e: + logger.err(f"Failed to update {spreadsheet_id}:{sheet_name} due to {e}") + exit() + + updated_cells = result.get('updatedCells', 0) + logger.info(f"Successfully updated {updated_cells} cells") + return True, f"Successfully updated {updated_cells} cells" + +class GoogleSheetManager: + """ + handles the boilerplate of creating an authenticated + service and returning a spreadsheet object. + """ + def __init__(self): + self._service = None + + def connect(self, credentials_file:str) -> Resource: + """ + Connects to the Google Sheets API using service account credentials. + + Args: + credentials_file (str): Path to the Google service account credentials file. + + Returns: + build: The Google Sheets API service object. + """ + SCOPES = ['https://www.googleapis.com/auth/spreadsheets'] + CONFIG_FILE = credentials_file + + if not os.path.exists(CONFIG_FILE): + raise Exception(f"Configuration file not found: {CONFIG_FILE}") + + try: + creds = service_account.Credentials.from_service_account_file( + CONFIG_FILE, + scopes=SCOPES + ) + + self._service = build('sheets', 'v4', credentials=creds) + return self.service + + except Exception as e: + raise Exception(f"Failed to create Google Sheets service: {str(e)}") + + @property + def service(self) -> Resource: + """ + getter for service. 
+ Using service as property allows instantiation of object and + authentication to be separated, while also ensuring that all calls + to service are authenticated + """ + if self._service is None: + raise ValueError("Connect not executed") + return self._service + + def sheet(self, spreadsheet_id:str, sheet_name:str) -> GoogleSheet: + """ + Uses instantiated service to fetch Google Sheet + `spreadsheet_id` and fetches the `sheet_name` spreadsheet + """ + try: + sheet = self.service.spreadsheets().values().get( + spreadsheetId=spreadsheet_id, # spreadsheet id is base64 in edit url + range=sheet_name + ).execute() + except Exception as e: + print(f"Failed to read {spreadsheet_id}:{sheet_name} due to {e}") + exit() + return GoogleSheet(sheet)