3 * UNFS3 file descriptor cache
4 * (C) 2004, Pascal Schmidt
5 * see file LICENSE for license details
10 #include <sys/types.h>
26 #include "Config/exports.h"
31 * intention of the file descriptor cache
33 * for READ operations, the intent is to open() the file on the first
34 * access and to close() it when we hit EOF or after two seconds of
37 * for WRITE operations, the intent is to open() the file on the first
38 * UNSTABLE access and to close() it when COMMIT is called or after
39 * two seconds of inactivity.
41 * There are three states of an entry:
42 * 1) Unused. use == 0.
43 * 2) Open fd. use != 0, fd != -1.
44 * 3) Pending fsync/close error, to be reported in next COMMIT or WRITE. use != 0, fd == -1.
46 * Handling fsync/close errors 100% correctly is very difficult for a
47 * user space server. Although rare, fsync/close might fail, for
48 * example when out of quota or closing a file on a NFS file
49 * system. The most correct way of handling these errors would be to
50 * keep track of "dirty" and failed ranges. However, this would
51 * require runtime memory allocation, with no known upper bound, which
52 * in turn can lead to DOS attacks etc. Our solution returns a
53 * fsync/close error in the first WRITE or COMMIT
54 * response. Additionally, the write verifier is changed. Subsequent
55 * COMMITs may succeed even though data has been lost, but since the
56 * verifier is changed, clients will notice this and re-send their
57 * data. Eventually, with some luck, all clients will get an IO error.
60 /* number of entries in fd cache */
61 #define FD_ENTRIES 256
63 /* The number of seconds to wait before closing inactive fd */
64 #define INACTIVE_TIMEOUT 2
66 /* The number of seconds to keep pending errors */
67 #define PENDING_ERROR_TIMEOUT 7200 /* 2 hours */
/* NOTE(review): the enclosing "typedef struct { ... } fd_cache_t;" lines are
   elided from this excerpt -- only the per-entry fields are visible here.
   Entry states (see header comment): use == 0 means unused; use != 0 with
   fd != -1 means open descriptor; use != 0 with fd == -1 means a pending
   fsync/close error awaiting report via WRITE or COMMIT. */
70     int fd;			/* open file descriptor */
71     int kind;			/* read or write */
72     time_t use;			/* last use */
73     uint32 dev;			/* device */
74     uint64 ino;			/* inode */
75     uint32 gen;			/* inode generation */
/* the cache itself: fixed-size table, always scanned linearly */
78 static fd_cache_t fd_cache[FD_ENTRIES];
/* statistics: counts of cached descriptors open for reading resp. writing;
   incremented in fd_cache_add -- presumably decremented on removal in lines
   elided from this view (TODO confirm) */
81 int fd_cache_readers = 0;
82 int fd_cache_writers = 0;
85  * initialize the fd cache
/* Reset every slot to its default (unused) state. Only the kind reset is
   visible in this excerpt; the elided lines presumably set fd = -1 and
   use = 0 as well -- TODO confirm against the full file. */
87 void fd_cache_init(void)
91     for (i = 0; i < FD_ENTRIES; i++) {
93 	fd_cache[i].kind = UNFS3_FD_READ;
102  * find cache index to use for new entry
103  * returns an empty slot if found, else return error
105 static int fd_cache_unused(void)
108     static time_t last_warning = 0;
/* linear scan for the first unused slot (use == 0 marks a free entry) */
110     for (i = 0; i < FD_ENTRIES; i++) {
111 	if (fd_cache[i].use == 0)
/* no free slot: the table is full of open fds and/or pending errors */
115     /* Rate-limit: do not print this warning more than once every 10 seconds */
116     if (time(NULL) > last_warning + 10) {
117 	last_warning = time(NULL);
119 		"fd cache full due to more than %d active files or pending IO errors",
128  * remove an entry from the cache. The keep_on_error variable
129  * indicates if the entry should be kept in the cache upon
130  * fsync/close failures. It should be set to TRUE when fd_cache_del is
131  * called from a code path which cannot report an IO error back to the
132  * client through WRITE or COMMIT.
134 static int fd_cache_del(int idx, int keep_on_error)
/* fd == -1 here means the entry already holds a pending error (state 3);
   in that case there is nothing left to fsync/close */
140     if (fd_cache[idx].fd != -1) {
141 	if (fd_cache[idx].kind == UNFS3_FD_WRITE) {
142 	    /* sync file data if writing descriptor */
144 	    res1 = backend_fsync(fd_cache[idx].fd);
149 	res2 = backend_close(fd_cache[idx].fd);
/* descriptor is gone regardless of the close result */
150 	fd_cache[idx].fd = -1;
152 	/* return -1 if something went wrong during sync or close */
153 	if (res1 == -1 || res2 == -1) {
/* fsync failed and the caller cannot report it back to the client:
   change the write verifier so clients re-send their data (see the
   error-handling strategy described at the top of this file) */
160     if (res1 == -1 && !keep_on_error) {
161 	/* The verifier should not be changed until we actually report &
163 	regenerate_write_verifier();
/* Clear the slot (back to state 1, unused) unless the error must be kept
   for a later WRITE/COMMIT to report (keep_on_error with a failed fsync,
   which leaves the entry in state 3: use != 0, fd == -1) */
166     if (res1 != -1 || !keep_on_error) {
167 	fd_cache[idx].fd = -1;
168 	fd_cache[idx].use = 0;
169 	fd_cache[idx].dev = 0;
170 	fd_cache[idx].ino = 0;
171 	fd_cache[idx].gen = 0;
178  * add an entry to the cache
/* Entries are keyed by (dev, ino, gen) taken from the unfs3 file handle.
   NOTE(review): handling of a full cache (fd_cache_unused returning an
   error value) is in lines elided from this excerpt -- TODO confirm. */
180 static void fd_cache_add(int fd, unfs3_fh_t * ufh, int kind)
184     idx = fd_cache_unused();
186     /* update statistics */
187     if (kind == UNFS3_FD_READ)
/* populate the slot; use = now marks it active for the inactivity timer */
192 	fd_cache[idx].fd = fd;
193 	fd_cache[idx].kind = kind;
194 	fd_cache[idx].use = time(NULL);
195 	fd_cache[idx].dev = ufh->dev;
196 	fd_cache[idx].ino = ufh->ino;
197 	fd_cache[idx].gen = ufh->gen;
202  * find entry by operating system fd number
/* Linear scan; both fd and kind must match, so the same OS descriptor
   cached for reading is not confused with one cached for writing.
   Return of the found index / not-found value is elided from this view. */
204 static int idx_by_fd(int fd, int kind)
209     for (i = 0; i < FD_ENTRIES; i++)
210 	if (fd_cache[i].fd == fd && fd_cache[i].kind == kind) {
218  * find entry by fh (device, inode, and generation number)
/* Linear scan matching kind first, then the (dev, ino, gen) triple from
   the file handle. Return of the found index / not-found value is elided
   from this view. */
220 static int idx_by_fh(unfs3_fh_t * ufh, int kind)
225     for (i = 0; i < FD_ENTRIES; i++)
226 	if (fd_cache[i].kind == kind) {
227 	    if (fd_cache[i].dev == ufh->dev && fd_cache[i].ino == ufh->ino &&
228 		fd_cache[i].gen == ufh->gen) {
237  * open a file descriptor
238  * uses fd from cache if possible
240 int fd_open(const char *path, nfs_fh3 nfh, int kind, int allow_caching)
243     backend_statstruct buf;
/* the opaque NFS file handle carries our unfs3_fh_t (dev/ino/gen) */
244     unfs3_fh_t *fh = (void *) nfh.data.data_val;
/* cache hit path: reuse an already-open descriptor for this fh + kind */
246     idx = idx_by_fh(fh, kind);
249 	if (fd_cache[idx].fd == -1) {
250 	    /* pending error, report to client and remove from cache */
251 	    fd_cache_del(idx, FALSE);
254 	return fd_cache[idx].fd;
256     /* call open to obtain new fd */
257     if (kind == UNFS3_FD_READ)
258 	fd = backend_open(path, O_RDONLY);
260 	fd = backend_open(path, O_WRONLY);
264     /* check for local fs race: fstat the fd we just opened and verify it
         still matches the dev/ino/gen recorded in the file handle */
265     res = backend_fstat(fd, &buf);
267 	(fh->dev != buf.st_dev || fh->ino != buf.st_ino ||
268 	 fh->gen != backend_get_gen(buf, fd, path))) {
270 	 * local fs changed meaning of path between
271 	 * calling NFS operation doing fh_decomp and
274 	 * set errno to ELOOP to make calling NFS
275 	 * operation return NFS3ERR_STALE
283      * success, add to cache for later use
/* NOTE(review): the allow_caching parameter presumably gates this call in
   elided lines -- confirm against the full file */
286     fd_cache_add(fd, fh, kind);
292  * close a file descriptor
293  * returns error number from real close() if applicable
295 int fd_close(int fd, int kind, int really_close)
297     int idx, res1 = 0, res2 = 0;
299     idx = idx_by_fd(fd, kind);
/* cached descriptor found */
301 	/* update usage time of cache entry */
302 	fd_cache[idx].use = time(NULL);
304 	if (really_close == FD_CLOSE_REAL)
305 	    /* delete entry on real close, will close() fd */
306 	    return fd_cache_del(idx, FALSE);
/* FD_CLOSE_VIRTUAL (or similar): keep the fd cached; the handling of that
   branch and the combined return value are elided from this view */
310     /* not in cache, sync and close directly */
311     if (kind == UNFS3_FD_WRITE)
312 	res1 = backend_fsync(fd);
314     res2 = backend_close(fd);
324  * sync file descriptor data to disk
/* Implements COMMIT semantics: look up a cached write descriptor for the
   file handle and tear it down, which performs the fsync and reports any
   pending or new fsync/close error to the caller. */
326 int fd_sync(nfs_fh3 nfh)
329     unfs3_fh_t *fh = (void *) nfh.data.data_val;
331     idx = idx_by_fh(fh, UNFS3_FD_WRITE);
333 	/* delete entry, will fsync() and close() the fd */
334 	return fd_cache_del(idx, FALSE);
340  * purge/shutdown the cache
/* Called at server shutdown: flush and close every live entry. keep_on_error
   is TRUE because there is no client request left to report an error to;
   failures are only logged. */
342 void fd_cache_purge(void)
346     /* close any open file descriptors we still have */
347     for (i = 0; i < FD_ENTRIES; i++) {
348 	if (fd_cache[i].use != 0) {
349 	    if (fd_cache_del(i, TRUE) == -1)
/* NOTE(review): "%lu" may not match the uint32 dev / uint64 ino field types
   on all platforms (e.g. 32-bit long); consider PRIu32/PRIu64 with casts */
351 		       "Error during shutdown fsync/close for dev %lu, inode %lu",
352 		       fd_cache[i].dev, fd_cache[i].ino);
/* Periodic housekeeping: close descriptors idle longer than
   INACTIVE_TIMEOUT, and expire pending errors older than
   PENDING_ERROR_TIMEOUT once no recent pending error remains. */
361 void fd_cache_close_inactive(void)
/* set when any recently-recorded pending error still exists; found_error
   (set in elided lines) presumably flags that any pending error exists at
   all -- TODO confirm against the full file */
366     int active_error = 0;
/* pass 1: close idle fds and classify pending errors by age */
369     for (i = 0; i < FD_ENTRIES; i++) {
370 	/* Check for inactive open fds */
371 	if (fd_cache[i].use && fd_cache[i].fd != -1 &&
372 	    fd_cache[i].use + INACTIVE_TIMEOUT < now) {
/* keep_on_error = TRUE: no client request is in flight to report to */
373 	    fd_cache_del(i, TRUE);
376 	/* Check for inactive pending errors */
377 	if (fd_cache[i].use && fd_cache[i].fd == -1) {
379 	    if (fd_cache[i].use + PENDING_ERROR_TIMEOUT > now)
/* pass 2: only when every pending error is stale, drop them all at once */
384     if (found_error && !active_error) {
385 	/* All pending errors are old. Delete them all from the table and
386 	   generate new verifier. This is done to prevent the table from
387 	   filling up with old pending errors, perhaps for files that never
388 	   will be written again. In this case, we throw away the errors, and
389 	   change the server verifier. If clients have pending COMMITs, they
390 	   will notice the changed verifier and re-send. */
391 	for (i = 0; i < FD_ENTRIES; i++) {
392 	    if (fd_cache[i].use && fd_cache[i].fd == -1) {
393 		fd_cache_del(i, FALSE);
396 	regenerate_write_verifier();