bit-io.scm
;;; bit-io.scm  --  Jens Axel Søgaard -- april 2006

; This file started as a PLT port of Oleg's bit-reader
;    <http://okmij.org/ftp/Scheme/binary-io.html#bit-reader>,
; then a bit-writer was added, and finally bit-ports were
; added.

(module bit-io mzscheme
  (provide  with-input-from-bit-file
            with-output-to-bit-file
            open-input-bit-file
            open-output-bit-file
            close-input-bit-port
            close-output-bit-port
            read-bits
            write-bits
            current-output-bit-port
            current-input-bit-port
            flush-bits
            bit-file-position
            call-with-input-bit-file
            call-with-output-bit-file
            ; LOW LEVEL
            ;make-bit-reader
            ;make-bit-writer
            ;current-bit-reader
            ;current-bit-writer
            ;current-bit-flusher
            )
  
  ; A BIT-PORT consists of an underlying (byte) port and
  ; the current (bit)position.
  (define-struct bit-port (port pos))
  
  ; An INPUT-BIT-PORT is a bit-port with a concrete byte-reader
  ; which from which the corresponding bit-reader read is constructed.
  (define-struct (input-bit-port bit-port)  (byte-reader read))
  
  ; An INPUT-BIT-PORT is a bit-port with a concrete byte-writer
  ; which from which the corresponding bit-writer write is constructed.
  ; Bits aren't written to the underlying byte port until a whole
  ; byte is received - a flush operations is thus sometimes needed
  ; at the end of a file.
  (define-struct (output-bit-port bit-port) (byte-writer write flush))
  
  ; open-input-bit-file : path [symbol ... ] -> input-bit-port
  ;   analog to open-input-file
  (define (open-input-bit-file file . options)
    (let ([byte-port (apply open-input-file file options)])
      (make-input-bit-port byte-port 
                           0
                           (λ () (read-byte byte-port))
                           (make-bit-reader (λ () (read-byte byte-port))))))
  
  ; open-output-bit-file : path [symbol ...] -> output-bit-port
  ;   analog to open-output-file
  (define (open-output-bit-file file . options)
    (let ([byte-port (apply open-output-file file options)])
      (let-values ([(bit-writer bit-flusher) 
                    (make-bit-writer (λ (b) (write-byte b byte-port)))])
        (make-output-bit-port byte-port 
                              0
                              (λ (b) (write-byte b byte-port))
                              bit-writer
                              bit-flusher))))
  
  ; current bit-ports
  (define current-input-bit-port  (make-parameter 'none-yet))
  (define current-output-bit-port (make-parameter 'none-yet))
  
  ; close-input-bit-port : input-bit-port ->
  (define (close-input-bit-port bit-port)
    (close-input-port (bit-port-port bit-port)))
  
  ; close-output-bit-port : output-bit-port ->
  (define (close-output-bit-port bit-port)
    ((output-bit-port-flush bit-port))
    (close-output-port (bit-port-port bit-port)))
  
  ; with-input-from-bit-file : path (-> alpha) [symbol ...] -> alpha
  ;   analog to with-input-from-file
  (define (with-input-from-bit-file file thunk . options)
    (let ([in (apply open-input-bit-file file options)])
      (begin0
        (parameterize ([current-input-bit-port in])
          (thunk))
        (close-input-bit-port in))))
  
  ; with-output-to-bit-file : path (-> alpha) [symbol ...] -> alpha
  ;   analog to with-to-file
  (define (with-output-to-bit-file file thunk . options)
    (let ([out (apply open-output-bit-file file options)])
      (begin0
        (parameterize ([current-output-bit-port out])
          (thunk))
        (close-output-bit-port out))))
  
  (define (call-with-input-bit-file file proc . options)
    (let ([in (apply open-input-bit-file file options)])
      (begin0
        (proc in))
      (close-input-bit-port in)))
  
  (define (call-with-output-bit-file file proc . options)
    (let ([out (apply open-output-bit-file file options)])
      (begin0
        (proc out)
        (close-output-bit-port out))))
  
  ; write-bits : natural natural [output-bit-port] ->
  ;   write n lower bits from bs to the output-bit-port,
  ;   if no output-bit-port is given use current-output-bit-port
  (define write-bits
    (case-lambda 
      [(n bs)          (write-bits n bs (current-output-bit-port))]
      [(n bs bit-port) (begin
                         (set-bit-port-pos! bit-port (+ (bit-port-pos bit-port) n))
                         ((output-bit-port-write bit-port) n bs))]))
  
  ; read-bits : natural [input-bit-port] -> natural
  ;   read n bits from the input-bit-port,
  ;   if no input-bit-port is given, use current-input-bit-port
  (define read-bits
    (case-lambda 
      [(n)          (read-bits n (current-input-bit-port))]
      [(n bit-port) (begin
                      (set-bit-port-pos! bit-port (+ (bit-port-pos bit-port) n))
                      ((input-bit-port-read bit-port) n))]))
  
  ; flush-bits : [output-bit-port] ->
  ;    flush remaining bits in the cache by append zeros until a
  ;    whole byte can be written
  (define flush-bits
    (case-lambda
      [()             (flush-bits (current-output-bit-port))]
      [(out-bit-port) ((output-bit-port-flush out-bit-port))]))

  ; bit-file-position : bit-port -> natural
  ;   return the bit-position of the bit-port,
  ;   the bit-position of an input-bit-port is the number of bits read so far,
  ;   for an output-bit-port it is the number of bits written so far
  ; bit-file-position : bit-port natural -> natural
  ;   set the bit-position of the bit-port
  (define bit-file-position
    ; todo: this only sets the position on an input port
    (case-lambda 
      [(bit-port)   (bit-port-pos bit-port)]
      [(bit-port n) (begin
                      (unless (input-bit-port? bit-port)
                        (error 
                         (string-append 
                          "(bit-file-position bit-port n) is not implemented "
                          "for output bit ports.")))
                      (file-position (bit-port-port bit-port) (quotient n 8))
                      (read-bits (remainder n 8) bit-port))]))
  
  ;;;
  ;;; LOW LEVEL INTERFACE
  ;;;
  
  ; The following bit reader comes from
  ;     <http://okmij.org/ftp/Scheme/binary-io.html#bit-reader>
  
  ; Binary parsing
  
  ;----------------------------------------
  ; Apologia
  ;
  ; Binary parsing and unparsing are transformations between primitive or
  ; composite Scheme values and their external binary representations.
  ;
  ; Examples include reading and writing JPEG, TIFF, MP3, ELF file
  ; formats, communicating with DNS, Kerberos, LDAP, SLP internet
  ; services, participating in Sun RPC and CORBA/IIOP distributed systems,
  ; storing and retrieving (arrays of) floating-point numbers in a
  ; portable and efficient way. This project will propose a set of low- and
  ; intermediate- level procedures that make binary parsing possible.
  
  ; Scheme is a good language to do research in text compression. Text
  ; compression involves a great deal of building and traversing
  ; dictionaries, trees and similar data structures, where Scheme
  ; excels. Performance doesn't matter in research, but the size of
  ; compressed files does (to figure out the bpc for the common
  ; benchmarks). Variable-bit i/o is a necessity. It is implemented
  ; in the present file.
  
  ; ASN.1 corresponds to a higher-level parsing (LR parser
  ; vs. lexer). Information in LDAP responses and X.509 certificates is
  ; structural and recursive, and so lends itself to be processed in
  ; Scheme. Variable bit i/o is necessary, and so is a binary lexer for
  ; a LR parser. Parsing of ASN.1 is a highly profitable enterprise
  
  ;----------------------------------------
  ; The outline of the project
  ;
  ; Primitives and streams
  ;
  ; - read-byte
  ; - read-u8vector (cf. read-string)
  ; - with-input-from-u8vector, with-input-from-encoded-u8vector 'base64,...
  ; building binary i/o streams from a sequence of bytes. Streams over
  ; u8vector, u16vector, etc. provide a serial access to memory. See SRFI-4
  ;
  ; - read-bit, read-bits via overlayed streams given read-byte
  ; implemented in the present file.
  ;
  ; -  mmap-u8vector, munmap-u8vector
  ;
  ; Conversions
  ;  - u8vector->integer u8vector endianness,
  ;    u8vector->sinteger u8vector endianness
  ;  These conversion procedures turn a sequence of bytes to an unsigned or
  ;  signed integer, minding the byte order. The u8vector in question can
  ;  have size 1,2,4,8, 3 etc. bytes. These two functions therefore can be
  ;  used to read shorts, longs, extra longs, etc. numbers.
  ;  - u8vector-reverse and other useful u8vector operations
  ;
  ;  - modf, frexp, ldexp
  ;  The above primitives can be emulated in R5RS, yet they are quite handy
  ;  (for portable FP manipulation) and can be executed very efficiently by
  ;  an FPU.
  ;
  ; Higher-level parsing and combinators
  ; These are combinators that can compose primitives above for more
  ; complex (possibly iterative) actions.
  ;
  ; - skip-bits, next-u8token,...
  ; - IIOP, RPC/XDR, RMI
  ; - binary lexer for existing LR/LL-parsers
  ;
  ; The composition of primitives and combinators will represent binary
  ; parsing language in a _full_ notation. This is similar to XPath
  ; expressions in full notation. Later we need to find out the
  ; most-frequently used patterns of the binary parsing language and
  ; design an abbreviated notation. The latter will need a special
  ; "interpreter". The abbreviated notation may turn out to look like
  ; Olin's regular expressions.
  
  
  ;;========================================================================
  ;;			Configuration section
  ;;
  ; Performance is very important for binary parsing. We have to get all
  ; help from a particular Scheme system we can get. If a Scheme function
  ; can support the following primitives faster, we should take
  ; advantage of that fact.
  
  ;; Configuration for PLT
  
  (define-syntax <<  (syntax-rules () [(_ x n) (arithmetic-shift x n)]))
  (define-syntax >>  (syntax-rules () [(_ x n) (arithmetic-shift x (- n))]))
  (define-syntax <<1 (syntax-rules () [(_ x)   (arithmetic-shift x 1)]))
  (define-syntax >>1 (syntax-rules () [(_ x)   (arithmetic-shift x -1)]))
  
  (define-syntax bit-set? (syntax-rules () [(_ x mask) (not (zero? (bitwise-and x mask)))]))
  
  ;; End configuration for PLT
  
  ; combine bytes in the MSB order. A byte may be #f
  (define (combine-two b1 b2)		; The result is for sure a fixnum
    (and b1 b2 (bitwise-ior (<< b1 8) b2)))
  
  (define (combine-three b1 b2 b3)	; The result is for sure a fixnum
    (and b1 b2 b3 (bitwise-ior (<< (bitwise-ior (<< b1 8) b2) 8) b3)))
  
  ; Here the result may be a BIGNUM
  (define (combine-bytes . bytes)
    (cond 
      ((null? bytes) 0)
      ((not (car bytes)) #f)
      (else 
       (let loop ((bytes (cdr bytes)) (result (car bytes)))
         (cond
           ((null? bytes) result)
           ((not (car bytes)) #f)
           (else (loop (cdr bytes) (+ (car bytes) (* 256 result)))))))))
  
  ;========================================================================
  ;			   Reading a byte
  
  ; Read-byte is a fundamental primitive; it needs to be
  ; added to the standard. Most of the other functions are library
  ; procedures. The following is an approximation, which clearly doesn't
  ; hold if the port is a Unicode (especially UTF-8) character stream.
  
  ; The mzscheme read-byte is used.
  
  ; Return a byte as an exact integer [0,255], or the EOF object
  #;(define (read-byte port)
      (let ((c (read-char port)))
        (if (eof-object? c) c (char->integer c))))
  
  ; The same as above, but returns #f on EOF.
  (define (read-byte-f port)
    (let ([b (read-byte port)])
      (and (not (eof-object? b)) b)))
  
  
  ;========================================================================
  ;			Bit stream
  
  ; -- Function: make-bit-reader BYTE-READER
  
  ; Given a BYTE-READER (a thunk), construct and return a function
  ;	bit-reader N
  ;
  ; that reads N bits from a byte-stream represented by the BYTE-READER.
  ; The BYTE-READER is a function that takes no arguments and returns
  ; the current byte as an exact integer [0-255]. The byte reader
  ; should return #f on EOF.
  ; The bit reader returns N bits as an exact unsigned integer,
  ; 0 -... (no limit). N must be a positive integer, otherwise the bit reader
  ; returns #f. There is no upper limit on N -- other than the size of the
  ; input stream itself and the amount of (virtual) memory an OS is willing
  ; to give to your process. If you want to read 1M of _bits_, go ahead.
  ;
  ; It is assumed that the bit order is the most-significant bit first.
  ;
  ; Note the bit reader keeps the following condition true at all times:
  ;	(= current-inport-pos (ceiling (/ no-bits-read 8)))
  ; That is, no byte is read until the very moment we really need (some of)
  ; its bits. The bit reader does _not_ "byte read ahead".
  ; Therefore, it can be used to handle a concatenation of different
  ; bit/byte streams *STRICTLY* sequentially, _without_ 'backing up a char',
  ; 'unreading-char' etc. tricks.
  ; For example, make-bit-reader has been used to read GRIB files of
  ; meteorological data, which made of several bitstreams with headers and
  ; tags.
  ; Thus careful attention to byte-buffering and optimization are the
  ; features of this bit reader.
  ;
  ; Usage example:
  ;	(define bit-reader (make-bit-reader (lambda () #b11000101)))
  ;	(bit-reader 3) ==> 6
  ;	(bit-reader 4) ==> 2
  ; The test driver below is another example.
  ;
  ; Notes on the algorithm.
  ; The function recognizes and handles the following special cases:
  ;  - the buffer is empty and 8, 16, 24 bits are to be read
  ;  - reading all bits which are currently in the byte-buffer
  ;    (and then maybe more)
  ;  - reading only one bit
  
  ; Since the bit reader is going to be called many times, optimization is
  ; critical. We need all the help from the compiler/interpreter
  ; we can get.
  
  (define (make-bit-reader byte-reader)
    (let ((buffer 0) (mask 0)     ; mask = 128 means that the buffer is full and
                     ; the msb bit is the current (yet unread) bit
                     (bits-in-buffer 0))
      
      ; read the byte into the buffer and set up the counters.
      ; return #f on eof
      (define (set-buffer)
        (set! buffer (byte-reader))
        (and buffer
             (begin
               (set! bits-in-buffer 8)
               (set! mask 128)
               #t)))
      
      ; Read fewer bits than there are in the buffer
      (define (read-few-bits n)
        (let ((value (bitwise-and buffer 	; all bits in buffer
                                  (sub1 (<<1 mask)))))
          (set! bits-in-buffer (- bits-in-buffer n))
          (set! mask (>> mask n))
          (>> value bits-in-buffer))) ; remove extra bits
      
      ; read n bits given an empty buffer, and append them to value, n>=8
      (define (add-more-bits value n)
        (let loop ((value value) (n n))
          (cond
	    ((zero? n) value)
	    ((< n 8)
	     (let ((rest (read-n-bits n)))
	       (and rest (+ (* value (<< 1 n)) rest))))
	    (else
	     (let ((b (byte-reader)))
	       (and b (loop (+ (* value 256) b) (- n 8))))))))
      
      ; The main module
      (define (read-n-bits n)
        ; Check the most common cases first
        (cond
	  ((not (positive? n)) #f)
	  ((zero? bits-in-buffer)	; the bit-buffer is empty
	   (case n
	     ((8) (byte-reader))
	     ((16) 
	      (let ((b (byte-reader)))
		(combine-two b (byte-reader))))
	     ((24)
	      (let* ((b1 (byte-reader)) (b2 (byte-reader)))
		(combine-three b1 b2 (byte-reader))))
	     (else
	      (cond
                ((< n 8)
                 (and (set-buffer) (read-few-bits n)))
                ((< n 16)
                 (let ((b (byte-reader)))
                   (and (set-buffer)
                        (bitwise-ior (<< b (- n 8))
                                     (read-few-bits (- n 8))))))
                (else
                 (let ((b (byte-reader)))
                   (and b (add-more-bits b (- n 8)))))))))
          
	  ((= n 1)			; read one bit
	   (let ((value (if (bit-set? buffer mask) 1 0)))
	     (set! mask (>>1 mask))
	     (set! bits-in-buffer (sub1 bits-in-buffer))
	     value))
          
	  ((>= n bits-in-buffer)	; will empty the buffer
	   (let ((n-rem (- n bits-in-buffer))
		 (value (bitwise-and buffer 	; for mask=64, it'll be &63
                                     (sub1 (<<1 mask)))))
	     (set! bits-in-buffer 0)
	     (cond
               ((zero? n-rem) value)
               ((<= n-rem 16)
                (let ((rest (read-n-bits n-rem)))
                  (and rest (bitwise-ior (<< value n-rem) rest))))
               (else (add-more-bits value n-rem)))))
	  (else (read-few-bits n))))
      read-n-bits)
    )
  
  ;;;
  ;;; BIT WRITER
  ;;;
  
  ; -- Function: make-bit-writer BYTE-WRITER
  
  ; Given a BYTE-WRITER (function of one argument), construct and return a function
  ;	bit-writer N B
  ;
  ; that writes N bits represented by the integer B to a byte-stream represented
  ; by the BYTE-WRITER.
  ; The BYTE-WRITER is a function that takes one argument and writes
  ; the given byte as an exact integer [0-255].
  
  ; It is assumed that the bit order is the most-significant bit first.
  ;
  ; Note the bit writer will output bytes as soon as possible. That is
  ; the maximum number of waiting bits are 7. Call bit-writer with a
  ; non-number as argument to flush the remainin bits.
  
  (define (make-bit-writer byte-writer)
    (let ((buffer 0) 
          (bits-in-buffer 0))
      
      (define (empty-buffer!)
        (set! buffer 0)
        (set! bits-in-buffer 0))
      
      (define (low-bits n b)
        (bitwise-and b (vector-ref #(0 1 3 7 15 31 63 127 255) n)))
      
      (define (extend-buffer! n b)
        (set! buffer (bitwise-ior (<< buffer n) b))
        (set! bits-in-buffer (+ bits-in-buffer n)))
      
      (define (set-buffer! n b)
        (set! buffer b)
        (set! bits-in-buffer n))
      
      (define (integer-length n)
        (unless (and (integer? n) (not (negative? n)))
          (error "a non-negative integer was expected, got: " n))
        (if (<= n 1)
            1
            (+ 1 (integer-length (arithmetic-shift n -1)))))
      
      (define (flush-buffer)
        (byte-writer (<< buffer (- 8 bits-in-buffer))))
      
      (define (write-n-bits n b)
        (when (and (number? n) (not (zero? n))
                   (> (integer-length b) n))
          (error "doh!" (list n b)))
        ; (set! b (low-bits n b))
        (cond
          ((not (positive? n))
           #f)
	  ((zero? bits-in-buffer)	; the bit-buffer is empty
	   (case n
	     ((8)  (byte-writer b))
	     ((16) (byte-writer (>> (bitwise-and #b1111111100000000 b) 8))
                   (byte-writer (bitwise-and #b11111111 b)))
             (else
              (let ([r (remainder n 8)])
                (cond
                  ((zero? r)  (for-each byte-writer
                                        (let loop ([n n] [b b] [l '()])
                                          (if (= n 0)  
                                              l
                                              (loop (- n 8)
                                                    (>> b 8)
                                                    (cons (bitwise-and #b11111111 b) l))))))
                  ((< n 8)  (set-buffer! n b))
                  ((< n 16) (byte-writer (>> b (- n 8)))
                            (set-buffer! (- n 8) (bitwise-and b (>> #b11111111 (- 16 n)))))
                  (else     (let ([bits-to-buffer (remainder n 8)]) ; output all whole bytes, and buffer the rest
                              (write-n-bits (- n bits-to-buffer)
                                            (>> b bits-to-buffer))
                              (set-buffer! bits-to-buffer                
                                           (bitwise-and b (vector-ref #(0 1 3 7 15 31 63 127 255) r))))))))))
          ((< n (- 8 bits-in-buffer))  ; everything goes to the buffer
           (extend-buffer! n b))
          (else                        
           (let ([m (- 8 bits-in-buffer)])
             ;(display (list buffer bits-in-buffer n) (current-error-port))
             ; (flush-output (current-error-port))
             ; the buffer and the initial bits make a byte
             
             (byte-writer (bitwise-ior (<< buffer m)
                                       (>> b (- n m))))
             (empty-buffer!)
             ; write the rest
             (write-n-bits (- n m) (bitwise-xor 
                                    b (<< (>> b (- n m))
                                          (- n m))))))))
      (values write-n-bits flush-buffer)))
  
  
  ;;; TEST
  
  #;
  (define (test)
    (define (naturals n)
      (do ([i 0 (+ i 1)]
           [l '() (cons i l)])
        [(= i n) l]))
    
    ; write the numbers 999 ... 1 to "tmp" and read them again
    
    (with-output-to-file "tmp"
      (lambda ()
        (let-values 
            ([(write flush) (make-bit-writer  write-byte)])
          (for-each (lambda (n)
                      (write n n))
                    (naturals 100))
          (flush)))
      'replace)
    
    (with-input-from-file "tmp"
      (lambda ()
        (let ([r (make-bit-reader read-byte)])
          (for-each (lambda (n)
                      (display (r n))
                      (display " "))
                    (naturals 100))))))
  
  ;;;
  ;;; PARAMETERS
  ;;;
  
  (define current-bit-reader (make-parameter (make-bit-reader read-byte)))
  
  (define-values  (current-bit-writer current-bit-flusher)
    (let-values ([(writer flusher) (make-bit-writer write-byte)])
      (values (make-parameter writer)
              (make-parameter flusher))))
  
  
  )