Answer of exercise 9

We reuse the data structures already defined above and collect them in a Tarlib module. We define a warning function which does not stop the program or alter the return code of the program.

open Sys open Unix open Tarlib let warning path message = prerr_endline (path ^ ": " ^ message)

We start with the function that writes a record header in a buffer. It’s a tedious function but it must be done with care as a single error in a header can corrupt the entire archive. In particular we must pay attention to the limits imposed by the file format. For example the size of paths is limited to 99 bytes (There are extensions to the format to handle longer path but it’s not the goal of this project).

let write_header_to_buffer source infos kind = let size = if kind = REG then infos.st_size else 0 in String.fill buffer 0 block_size '\000'; let put len string offset = String.blit string 0 buffer offset (min (String.length string) len) in let put_int8 x = put 7 (Printf.sprintf "%07o" x) in let put_int12 x = put 11 (Printf.sprintf "%011o" x) in let put_char c offset = buffer.[offset] <- c in let put_path s offset = if String.length s <= 99 then put 99 s offset else raise (Error ("path too long", s)) in put_path (if kind = DIR then source ^ "/" else source) 0; put_int8 infos.st_perm 100; put_int8 infos.st_uid 108; put_int8 infos.st_gid 116; put_int12 size 124; put_int12 (int_of_float infos.st_mtime) 136; put 7 "ustar " 257; put 31 (getpwuid infos.st_uid).pw_name 265; put 31 (getgrgid infos.st_gid).gr_name 297; (* Fields dev and rdev are only used for special files, which we omit *) put_char begin match kind with | REG -> '0' | LINK s -> put_path s 157; '1' | LNK s -> put_path s 157; '2' | DIR -> '5' | _ -> failwith "Special files not implemented" end 156; let rec sum s i = if i < 0 then s else sum (s + Char.code buffer.[i]) (pred i) in let checksum = sum (Char.code ' ' * 8) (block_size - 1) in put 8 (Printf.sprintf "%06o\000 " checksum) 148;;

The following function creates a record header for a file. source is the file name, infos is the stats information of the file and kind is the type of file.

let header source infos kind = { name = source; size = if kind = REG then infos.st_size else 0; perm = infos.st_perm; mtime = int_of_float infos.st_mtime; uid = infos.st_uid; gid = infos.st_gid; user = (getpwuid infos.st_uid).pw_name; group = (getgrgid infos.st_gid).gr_name; kind = kind }

To write a file in the archive, we define a variant of file_copy which takes as an argument the number of bytes to copy and verifies that the end of file corresponds to that size. Otherwise, an error is raised: this handles the abnormal case where a file is modified during the archival process. To limit the archive’s corruption to a single file we do not write beyond that size.

let write_file len source fdout = let fdin = openfile source [O_RDONLY] 0 in let error () = raise (Error ("File changed size", source)) in let rec copy_loop len = match read fdin buffer 0 buffer_size with 0 -> close fdin; if len > 0 then error () | r -> let len = len - r in if len < 0 then (close fdin; error ()); ignore (write fdout buffer 0 r); copy_loop len in copy_loop len;; let padding fd len = if len > 0 then ignore (write fd (String.make len '\000') 0 len);;

We now tackle the creation of the archive. The files already written in the archive are stored in a hashtable with their path so that they are not copied more than once. We also store the directories that were already written so as not to copy them again: it can happen that the archival root is already contained in another and we don’t want to copy it again (even though that would be harmless).

The data needed to write an archive is a file descriptor pointing on the file to write, the file and directory cache (see above) and a size variable that remembers the current archive size (to pad it to a minimal size if needed). The archive type collects all this information in a record:

type archive = { regfiles : (int * int, string) Hashtbl.t; dirfiles : (int * int, bool) Hashtbl.t; fd : file_descr; st : stats; mutable size : int } let try_new_dir archive dir = try Hashtbl.find archive.dirfiles dir with Not_found -> Hashtbl.add archive.dirfiles dir false; true

Here is the main function that writes an entire hierarchy starting from a file path given on the command line. This function is not difficult but needs some care with pathological cases. In particular we saw how to detect when a file is modified the archival. A sub case of this when the archive is being archived itself…

let verbose = ref true;; let write_from archive file = if not (Filename.is_relative file) then raise (Error ("absolute path", file)); let rec write_rec archive file = let source = if Filename.basename file = "" then Filename.dirname file else file in if !verbose then begin prerr_endline source end; let st = lstat source in if st.st_ino = archive.st.st_ino && st.st_dev = archive.st.st_dev then warning source "Skipping archive itself!" else let write_header kind = write_header_to_buffer source st kind; ignore (write archive.fd buffer 0 block_size) in match st.st_kind with S_REG -> begin try if st.st_nlink = 1 then raise Not_found; let path = Hashtbl.find archive.regfiles (st.st_ino, st.st_dev) in write_header (LINK path); with Not_found -> if st.st_nlink > 1 then Hashtbl.add archive.regfiles (st.st_ino, st.st_dev) source; write_header REG; write_file st.st_size source archive.fd; let t = (block_size-1 + st.st_size) / block_size * block_size in padding archive.fd (t - st.st_size); archive.size <- archive.size + t + block_size; end | S_LNK -> write_header (LNK (readlink source)); | S_DIR when try_new_dir archive (st.st_ino, st.st_dev) -> write_header DIR; Misc.iter_dir begin fun file -> if file = Filename.current_dir_name then () else if file = Filename.parent_dir_name then () else write_rec archive (source ^ "/" ^ file) end source | S_DIR -> warning source "Ignoring directory already in archive." | _ -> prerr_endline ("Can't cope with special file " ^ source) in write_rec archive file;;

We keep track of regular files that may have hard links in the regfiles table. It’s not necessary for files that have a single link.

Here’s the main function. In case of error, it is better to remove the erroneous archive.

let min_archive_size = 20 * block_size;; let build tarfile files = let fd, remove = if tarfile = "-" then stdout, ignore else openfile tarfile [ O_WRONLY; O_CREAT; O_TRUNC ] 0o666, unlink in try let arch = { regfiles = Hashtbl.create 13; dirfiles = Hashtbl.create 13; st = fstat fd; fd = fd; size =0 } in Array.iter (write_from arch) files; padding fd (min_archive_size - arch.size); close fd with z -> remove tarfile; close fd; raise z;;

We end by parsing the command line arguments.

let usage () = prerr_endline "Usage: tar -cvf tarfile file1 [ file2 ... ] "; exit 2;; let tar () = let argn = Array.length Sys.argv in if argn > 3 && Sys.argv.(1) = "-cvf" then build Sys.argv.(2) (Array.sub Sys.argv 3 (argn-3)) else usage ();; let _ = try handle_unix_error tar () with Error (mes, s) -> prerr_endline ("Error: " ^ mes ^ ": " ^ s); exit 1;;

* * *