1/*
2 *   Copyright (c) International Business Machines Corp., 2000-2002
3 *   Portions Copyright (c) Christoph Hellwig, 2001-2002
4 *
5 *   This program is free software;  you can redistribute it and/or modify
6 *   it under the terms of the GNU General Public License as published by
7 *   the Free Software Foundation; either version 2 of the License, or
8 *   (at your option) any later version.
9 *
10 *   This program is distributed in the hope that it will be useful,
11 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
12 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
13 *   the GNU General Public License for more details.
14 *
15 *   You should have received a copy of the GNU General Public License
16 *   along with this program;  if not, write to the Free Software
17 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20/*
21 *      jfs_txnmgr.c: transaction manager
22 *
23 * notes:
24 * transaction starts with txBegin() and ends with txCommit()
25 * or txAbort().
26 *
27 * tlock is acquired at the time of update;
28 * (obviate scan at commit time for xtree and dtree)
29 * tlock and mp points to each other;
30 * (no hashlist for mp -> tlock).
31 *
32 * special cases:
33 * tlock on in-memory inode:
34 * in-place tlock in the in-memory inode itself;
35 * converted to page lock by iWrite() at commit time.
36 *
37 * tlock during write()/mmap() under anonymous transaction (tid = 0):
38 * transferred (?) to transaction at commit time.
39 *
40 * use the page itself to update allocation maps
41 * (obviate intermediate replication of allocation/deallocation data)
42 * hold on to mp+lock thru update of maps
43 */
44
45
46#include <linux/fs.h>
47#include <linux/vmalloc.h>
48#include <linux/smp_lock.h>
49#include <linux/completion.h>
50#include "jfs_incore.h"
51#include "jfs_filsys.h"
52#include "jfs_metapage.h"
53#include "jfs_dinode.h"
54#include "jfs_imap.h"
55#include "jfs_dmap.h"
56#include "jfs_superblock.h"
57#include "jfs_debug.h"
58
/*
 *      transaction management structures
 *
 * TxAnchor anchors the free lists of the tblock and tlock tables, the wait
 * queues used when those lists run dry, and the lists of inodes that carry
 * anonymous tlocks.  The free lists are index-linked through the 'next'
 * field of the table entries; index 0 terminates a list.
 */
static struct {
	/* tblock */
	int freetid;		/* index of a free tid structure */
	wait_queue_head_t freewait;	/* eventlist of free tblock */

	/* tlock */
	int freelock;		/* index first free lock word */
	wait_queue_head_t freelockwait;	/* eventlist of free tlock */
	wait_queue_head_t lowlockwait;	/* eventlist of ample tlocks */
	int tlocksInUse;	/* Number of tlocks in use */
	spinlock_t LazyLock;	/* synchronize sync_queue & unlock_queue */
/*	struct tblock *sync_queue; * Transactions waiting for data sync */
	struct tblock *unlock_queue;	/* Txns waiting to be released */
	struct tblock *unlock_tail;	/* Tail of unlock_queue */
	struct list_head anon_list;	/* inodes having anonymous txns */
	struct list_head anon_list2;	/* inodes having anonymous txns
					   that couldn't be sync'ed */
} TxAnchor;
80
/*
 * Sizes of the transaction block and transaction lock tables.  The tlock
 * watermarks below are derived from the table size so that the three values
 * cannot drift apart.
 */
enum {
	TXBLOCK_TABLE_SIZE = 512,
	TXLOCK_TABLE_SIZE = 4096
};

static int nTxBlock = TXBLOCK_TABLE_SIZE;	/* number of transaction blocks */
struct tblock *TxBlock;	        /* transaction block table */

static int nTxLock = TXLOCK_TABLE_SIZE;	/* number of transaction locks */

/*
 * Watermarks are 40% and 80% of the tlock table, computed with integer
 * arithmetic only (no floating-point constants); the results, 1638 and
 * 3276, equal the truncated values of 4096*.4 and 4096*.8.
 */
static int TxLockLWM = (TXLOCK_TABLE_SIZE * 4) / 10;	/* Low water mark for number of txLocks used */
static int TxLockHWM = (TXLOCK_TABLE_SIZE * 8) / 10;	/* High water mark for number of txLocks used */
struct tlock *TxLock;           /* transaction lock table */
static int TlocksLow = 0;	/* Indicates low number of available tlocks */
89
90
91/*
92 *      transaction management lock
93 */
94static spinlock_t jfsTxnLock = SPIN_LOCK_UNLOCKED;
95
96#define TXN_LOCK()              spin_lock(&jfsTxnLock)
97#define TXN_UNLOCK()            spin_unlock(&jfsTxnLock)
98
99#define LAZY_LOCK_INIT()	spin_lock_init(&TxAnchor.LazyLock);
100#define LAZY_LOCK(flags)	spin_lock_irqsave(&TxAnchor.LazyLock, flags)
101#define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags)
102
103DECLARE_WAIT_QUEUE_HEAD(jfs_sync_thread_wait);
104DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait);
105
/*
 * Retry logic exists outside these macros to protect from spurious wakeups.
 */
109static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
110{
111	DECLARE_WAITQUEUE(wait, current);
112
113	add_wait_queue(event, &wait);
114	set_current_state(TASK_UNINTERRUPTIBLE);
115	TXN_UNLOCK();
116	schedule();
117	current->state = TASK_RUNNING;
118	remove_wait_queue(event, &wait);
119}
120
/*
 * TXN_SLEEP: sleep on <event> with jfsTxnLock dropped, then take the lock
 * again.  The caller must hold jfsTxnLock and must re-check its condition
 * afterwards.
 *
 * The body is wrapped in do { } while (0) so that "TXN_SLEEP(e);" is a
 * single statement and can be used in an unbraced if/else.
 */
#define TXN_SLEEP(event)\
do {\
	TXN_SLEEP_DROP_LOCK(event);\
	TXN_LOCK();\
} while (0)

/* wake every task sleeping on <event> */
#define TXN_WAKEUP(event) wake_up_all(event)
128
129
/*
 *      statistics
 *
 * Counters and high-water marks updated by the allocation paths of the
 * transaction manager.
 * NOTE(review): the "4:" prefix in the field comments is presumably the
 * field size in bytes -- confirm.
 */
struct {
	tid_t maxtid;		/* 4: biggest tid ever used */
	lid_t maxlid;		/* 4: biggest lid ever used */
	int ntid;		/* 4: # of transactions performed */
	int nlid;		/* 4: # of tlocks acquired */
	int waitlock;		/* 4: # of tlock wait */
} stattx;
140
141
142/*
143 * external references
144 */
145extern int lmGroupCommit(struct jfs_log * log, struct tblock * tblk);
146extern void lmSync(struct jfs_log *);
147extern int jfs_commit_inode(struct inode *, int);
148extern int jfs_stop_threads;
149
150struct task_struct *jfsCommitTask;
151extern struct completion jfsIOwait;
152
153/*
154 * forward references
155 */
156int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
157	  struct tlock * tlck, struct commit * cd);
158int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
159	    struct tlock * tlck);
160void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
161	   struct tlock * tlck);
162void inlineLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
163	       struct tlock * tlck);
164void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
165	    struct tlock * tlck);
166void txAbortCommit(struct commit * cd, int exval);
167static void txAllocPMap(struct inode *ip, struct maplock * maplock,
168			struct tblock * tblk);
169void txForce(struct tblock * tblk);
170static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd);
171int txMoreLock(void);
172static void txUpdateMap(struct tblock * tblk);
173static void txRelease(struct tblock * tblk);
174void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
175	   struct tlock * tlck);
176static void LogSyncRelease(struct metapage * mp);
177
178/*
179 *              transaction block/lock management
180 *              ---------------------------------
181 */
182
183/*
184 * Get a transaction lock from the free list.  If the number in use is
185 * greater than the high water mark, wake up the sync daemon.  This should
186 * free some anonymous transaction locks.  (TXN_LOCK must be held.)
187 */
188static lid_t txLockAlloc(void)
189{
190	lid_t lid;
191
192	while (!(lid = TxAnchor.freelock))
193		TXN_SLEEP(&TxAnchor.freelockwait);
194	TxAnchor.freelock = TxLock[lid].next;
195	HIGHWATERMARK(stattx.maxlid, lid);
196	if ((++TxAnchor.tlocksInUse > TxLockHWM) && (TlocksLow == 0)) {
197		jEVENT(0,("txLockAlloc TlocksLow\n"));
198		TlocksLow = 1;
199		wake_up(&jfs_sync_thread_wait);
200	}
201
202	return lid;
203}
204
205static void txLockFree(lid_t lid)
206{
207	TxLock[lid].next = TxAnchor.freelock;
208	TxAnchor.freelock = lid;
209	TxAnchor.tlocksInUse--;
210	if (TlocksLow && (TxAnchor.tlocksInUse < TxLockLWM)) {
211		jEVENT(0,("txLockFree TlocksLow no more\n"));
212		TlocksLow = 0;
213		TXN_WAKEUP(&TxAnchor.lowlockwait);
214	}
215	TXN_WAKEUP(&TxAnchor.freelockwait);
216}
217
218/*
219 * NAME:        txInit()
220 *
221 * FUNCTION:    initialize transaction management structures
222 *
223 * RETURN:
224 *
225 * serialization: single thread at jfs_init()
226 */
227int txInit(void)
228{
229	int k, size;
230
231	/*
232	 * initialize transaction block (tblock) table
233	 *
234	 * transaction id (tid) = tblock index
235	 * tid = 0 is reserved.
236	 */
237	size = sizeof(struct tblock) * nTxBlock;
238	TxBlock = (struct tblock *) vmalloc(size);
239	if (TxBlock == NULL)
240		return ENOMEM;
241
242	for (k = 1; k < nTxBlock - 1; k++) {
243		TxBlock[k].next = k + 1;
244		init_waitqueue_head(&TxBlock[k].gcwait);
245		init_waitqueue_head(&TxBlock[k].waitor);
246	}
247	TxBlock[k].next = 0;
248	init_waitqueue_head(&TxBlock[k].gcwait);
249	init_waitqueue_head(&TxBlock[k].waitor);
250
251	TxAnchor.freetid = 1;
252	init_waitqueue_head(&TxAnchor.freewait);
253
254	stattx.maxtid = 1;	/* statistics */
255
256	/*
257	 * initialize transaction lock (tlock) table
258	 *
259	 * transaction lock id = tlock index
260	 * tlock id = 0 is reserved.
261	 */
262	size = sizeof(struct tlock) * nTxLock;
263	TxLock = (struct tlock *) vmalloc(size);
264	if (TxLock == NULL) {
265		vfree(TxBlock);
266		return ENOMEM;
267	}
268
269	/* initialize tlock table */
270	for (k = 1; k < nTxLock - 1; k++)
271		TxLock[k].next = k + 1;
272	TxLock[k].next = 0;
273	init_waitqueue_head(&TxAnchor.freelockwait);
274	init_waitqueue_head(&TxAnchor.lowlockwait);
275
276	TxAnchor.freelock = 1;
277	TxAnchor.tlocksInUse = 0;
278	INIT_LIST_HEAD(&TxAnchor.anon_list);
279	INIT_LIST_HEAD(&TxAnchor.anon_list2);
280
281	stattx.maxlid = 1;	/* statistics */
282
283	return 0;
284}
285
286/*
287 * NAME:        txExit()
288 *
289 * FUNCTION:    clean up when module is unloaded
290 */
291void txExit(void)
292{
293	vfree(TxLock);
294	TxLock = 0;
295	vfree(TxBlock);
296	TxBlock = 0;
297}
298
299
300/*
301 * NAME:        txBegin()
302 *
303 * FUNCTION:    start a transaction.
304 *
305 * PARAMETER:   sb	- superblock
306 *              flag	- force for nested tx;
307 *
308 * RETURN:	tid	- transaction id
309 *
310 * note: flag force allows to start tx for nested tx
311 * to prevent deadlock on logsync barrier;
312 */
313tid_t txBegin(struct super_block *sb, int flag)
314{
315	tid_t t;
316	struct tblock *tblk;
317	struct jfs_log *log;
318
319	jFYI(1, ("txBegin: flag = 0x%x\n", flag));
320	log = JFS_SBI(sb)->log;
321
322	TXN_LOCK();
323
324      retry:
325	if (!(flag & COMMIT_FORCE)) {
326		/*
327		 * synchronize with logsync barrier
328		 */
329		if (test_bit(log_SYNCBARRIER, &log->flag) ||
330		    test_bit(log_QUIESCE, &log->flag)) {
331			TXN_SLEEP(&log->syncwait);
332			goto retry;
333		}
334	}
335	if (flag == 0) {
336		/*
337		 * Don't begin transaction if we're getting starved for tlocks
338		 * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately
339		 * free tlocks)
340		 */
341		if (TlocksLow) {
342			TXN_SLEEP(&TxAnchor.lowlockwait);
343			goto retry;
344		}
345	}
346
347	/*
348	 * allocate transaction id/block
349	 */
350	if ((t = TxAnchor.freetid) == 0) {
351		jFYI(1, ("txBegin: waiting for free tid\n"));
352		TXN_SLEEP(&TxAnchor.freewait);
353		goto retry;
354	}
355
356	tblk = tid_to_tblock(t);
357
358	if ((tblk->next == 0) && (current != jfsCommitTask)) {
359		/* Save one tblk for jfsCommit thread */
360		jFYI(1, ("txBegin: waiting for free tid\n"));
361		TXN_SLEEP(&TxAnchor.freewait);
362		goto retry;
363	}
364
365	TxAnchor.freetid = tblk->next;
366
367	/*
368	 * initialize transaction
369	 */
370
371	/*
372	 * We can't zero the whole thing or we screw up another thread being
373	 * awakened after sleeping on tblk->waitor
374	 *
375	 * memset(tblk, 0, sizeof(struct tblock));
376	 */
377	tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0;
378
379	tblk->sb = sb;
380	++log->logtid;
381	tblk->logtid = log->logtid;
382
383	++log->active;
384
385	HIGHWATERMARK(stattx.maxtid, t);	/* statistics */
386	INCREMENT(stattx.ntid);	/* statistics */
387
388	TXN_UNLOCK();
389
390	jFYI(1, ("txBegin: returning tid = %d\n", t));
391
392	return t;
393}
394
395
396/*
397 * NAME:        txBeginAnon()
398 *
399 * FUNCTION:    start an anonymous transaction.
400 *		Blocks if logsync or available tlocks are low to prevent
401 *		anonymous tlocks from depleting supply.
402 *
403 * PARAMETER:   sb	- superblock
404 *
405 * RETURN:	none
406 */
407void txBeginAnon(struct super_block *sb)
408{
409	struct jfs_log *log;
410
411	log = JFS_SBI(sb)->log;
412
413	TXN_LOCK();
414
415      retry:
416	/*
417	 * synchronize with logsync barrier
418	 */
419	if (test_bit(log_SYNCBARRIER, &log->flag) ||
420	    test_bit(log_QUIESCE, &log->flag)) {
421		TXN_SLEEP(&log->syncwait);
422		goto retry;
423	}
424
425	/*
426	 * Don't begin transaction if we're getting starved for tlocks
427	 */
428	if (TlocksLow) {
429		TXN_SLEEP(&TxAnchor.lowlockwait);
430		goto retry;
431	}
432	TXN_UNLOCK();
433}
434
435
436/*
437 *      txEnd()
438 *
439 * function: free specified transaction block.
440 *
441 *      logsync barrier processing:
442 *
443 * serialization:
444 */
445void txEnd(tid_t tid)
446{
447	struct tblock *tblk = tid_to_tblock(tid);
448	struct jfs_log *log;
449
450	jFYI(1, ("txEnd: tid = %d\n", tid));
451	TXN_LOCK();
452
453	/*
454	 * wakeup transactions waiting on the page locked
455	 * by the current transaction
456	 */
457	TXN_WAKEUP(&tblk->waitor);
458
459	log = JFS_SBI(tblk->sb)->log;
460
461	/*
462	 * Lazy commit thread can't free this guy until we mark it UNLOCKED,
463	 * otherwise, we would be left with a transaction that may have been
464	 * reused.
465	 *
466	 * Lazy commit thread will turn off tblkGC_LAZY before calling this
467	 * routine.
468	 */
469	if (tblk->flag & tblkGC_LAZY) {
470		jFYI(1,
471		     ("txEnd called w/lazy tid: %d, tblk = 0x%p\n",
472		      tid, tblk));
473		TXN_UNLOCK();
474
475		spin_lock_irq(&log->gclock);	// LOGGC_LOCK
476		tblk->flag |= tblkGC_UNLOCKED;
477		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
478		return;
479	}
480
481	jFYI(1, ("txEnd: tid: %d, tblk = 0x%p\n", tid, tblk));
482
483	assert(tblk->next == 0);
484
485	/*
486	 * insert tblock back on freelist
487	 */
488	tblk->next = TxAnchor.freetid;
489	TxAnchor.freetid = tid;
490
491	/*
492	 * mark the tblock not active
493	 */
494	--log->active;
495
496	/*
497	 * synchronize with logsync barrier
498	 */
499	if (test_bit(log_SYNCBARRIER, &log->flag) && log->active == 0) {
500		/* forward log syncpt */
501		/* lmSync(log); */
502
503		jFYI(1, ("     log barrier off: 0x%x\n", log->lsn));
504
505		/* enable new transactions start */
506		clear_bit(log_SYNCBARRIER, &log->flag);
507
508		/* wakeup all waitors for logsync barrier */
509		TXN_WAKEUP(&log->syncwait);
510	}
511
512	/*
513	 * wakeup all waitors for a free tblock
514	 */
515	TXN_WAKEUP(&TxAnchor.freewait);
516
517	TXN_UNLOCK();
518	jFYI(1, ("txEnd: exitting\n"));
519}
520
521
/*
 *      txLock()
 *
 * function: acquire a transaction lock on the specified <mp>
 *
 * parameter:
 *	tid	- id of the requesting transaction; 0 requests an anonymous
 *		  tlock, which is queued on the inode instead of on a tblock
 *	ip	- inode the page belongs to
 *	mp	- page to lock; when COMMIT_PAGE is not set in mp->xflag the
 *		  tlock is flagged tlckINODELOCK instead of tlckPAGELOCK
 *	type	- tlck* type bits; they are OR'ed into tlck->type
 *
 * return:      pointer to the tlock, or NULL when the page is locked by
 *		another transaction.  In the NULL case <mp> has been released
 *		and the caller has slept on that transaction's waitor queue;
 *		presumably the caller is expected to retry -- confirm against
 *		callers.
 *
 * serialization: TXN_LOCK is taken on entry and dropped before every return.
 */
struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
		     int type)
{
	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
	int dir_xtree = 0;
	lid_t lid;
	tid_t xtid;
	struct tlock *tlck;
	struct xtlock *xtlck;
	struct linelock *linelock;
	xtpage_t *p;
	struct tblock *tblk;

	assert(!test_cflag(COMMIT_Nolink, ip));

	TXN_LOCK();

	if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) &&
	    !(mp->xflag & COMMIT_PAGE)) {
		/*
		 * Directory inode is special.  It can have both an xtree tlock
		 * and a dtree tlock associated with it.
		 */
		dir_xtree = 1;
		lid = jfs_ip->xtlid;
	} else
		lid = mp->lid;

	/* is page not locked by a transaction ? */
	if (lid == 0)
		goto allocateLock;

	jFYI(1, ("txLock: tid:%d ip:0x%p mp:0x%p lid:%d\n",
		 tid, ip, mp, lid));

	/* is page locked by the requester transaction ? */
	tlck = lid_to_tlock(lid);
	if ((xtid = tlck->tid) == tid)
		goto grantLock;

	/*
	 * is page locked by anonymous transaction/lock ?
	 *
	 * (page update without transaction (i.e., file write) is
	 * locked under anonymous transaction tid = 0:
	 * anonymous tlocks maintained on anonymous tlock list of
	 * the inode of the page and available to all anonymous
	 * transactions until txCommit() time at which point
	 * they are transferred to the transaction tlock list of
	 * the committing transaction of the inode)
	 */
	if (xtid == 0) {
		/* the requesting transaction takes over the anonymous tlock */
		tlck->tid = tid;
		tblk = tid_to_tblock(tid);
		/*
		 * The order of the tlocks in the transaction is important
		 * (during truncate, child xtree pages must be freed before
		 * parent's tlocks change the working map).
		 * Take tlock off anonymous list and add to tail of
		 * transaction list
		 *
		 * Note:  We really need to get rid of the tid & lid and
		 * use list_head's.  This code is getting UGLY!
		 */
		if (jfs_ip->atlhead == lid) {
			if (jfs_ip->atltail == lid) {
				/* only anonymous txn.
				 * Remove from anon_list
				 */
				list_del_init(&jfs_ip->anon_inode_list);
			}
			jfs_ip->atlhead = tlck->next;
		} else {
			lid_t last;
			/* find the predecessor of lid on the anonymous list */
			for (last = jfs_ip->atlhead;
			     lid_to_tlock(last)->next != lid;
			     last = lid_to_tlock(last)->next) {
				assert(last);
			}
			lid_to_tlock(last)->next = tlck->next;
			if (jfs_ip->atltail == lid)
				jfs_ip->atltail = last;
		}

		/* insert the tlock at tail of transaction tlock list */

		if (tblk->next)
			lid_to_tlock(tblk->last)->next = lid;
		else
			tblk->next = lid;
		tlck->next = 0;
		tblk->last = lid;

		goto grantLock;
	}

	/* locked by a different, non-anonymous transaction */
	goto waitLock;

	/*
	 * allocate a tlock
	 */
      allocateLock:
	/* txLockAlloc() may sleep, dropping and re-taking TXN_LOCK */
	lid = txLockAlloc();
	tlck = lid_to_tlock(lid);

	/*
	 * initialize tlock
	 */
	tlck->tid = tid;

	/* mark tlock for meta-data page */
	if (mp->xflag & COMMIT_PAGE) {

		tlck->flag = tlckPAGELOCK;

		/* mark the page dirty and nohomeok */
		mark_metapage_dirty(mp);
		atomic_inc(&mp->nohomeok);

		jFYI(1,
		     ("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p\n",
		      mp, atomic_read(&mp->nohomeok), tid, tlck));

		/* if anonymous transaction, and buffer is on the group
		 * commit synclist, mark inode to show this.  This will
		 * prevent the buffer from being marked nohomeok for too
		 * long a time.
		 */
		if ((tid == 0) && mp->lsn)
			set_cflag(COMMIT_Synclist, ip);
	}
	/* mark tlock for in-memory inode */
	else
		tlck->flag = tlckINODELOCK;

	/* type bits are OR'ed in at grantLock below */
	tlck->type = 0;

	/* bind the tlock and the page */
	tlck->ip = ip;
	tlck->mp = mp;
	if (dir_xtree)
		jfs_ip->xtlid = lid;
	else
		mp->lid = lid;

	/*
	 * enqueue transaction lock to transaction/inode
	 */
	/* insert the tlock at tail of transaction tlock list */
	if (tid) {
		tblk = tid_to_tblock(tid);
		if (tblk->next)
			lid_to_tlock(tblk->last)->next = lid;
		else
			tblk->next = lid;
		tlck->next = 0;
		tblk->last = lid;
	}
	/* anonymous transaction:
	 * insert the tlock at head of inode anonymous tlock list
	 */
	else {
		tlck->next = jfs_ip->atlhead;
		jfs_ip->atlhead = lid;
		if (tlck->next == 0) {
			/* This inode's first anonymous transaction */
			jfs_ip->atltail = lid;
			list_add_tail(&jfs_ip->anon_inode_list,
				      &TxAnchor.anon_list);
		}
	}

	/* initialize type dependent area for linelock */
	linelock = (struct linelock *) & tlck->lock;
	linelock->next = 0;
	linelock->flag = tlckLINELOCK;
	linelock->maxcnt = TLOCKSHORT;
	linelock->index = 0;

	switch (type & tlckTYPE) {
	case tlckDTREE:
		linelock->l2linesize = L2DTSLOTSIZE;
		break;

	case tlckXTREE:
		linelock->l2linesize = L2XTSLOTSIZE;

		xtlck = (struct xtlock *) linelock;
		xtlck->header.offset = 0;
		xtlck->header.length = 2;

		if (type & tlckNEW) {
			xtlck->lwm.offset = XTENTRYSTART;
		} else {
			/* existing xtree page: start lwm at its nextindex */
			if (mp->xflag & COMMIT_PAGE)
				p = (xtpage_t *) mp->data;
			else
				p = &jfs_ip->i_xtroot;
			xtlck->lwm.offset =
			    le16_to_cpu(p->header.nextindex);
		}
		xtlck->lwm.length = 0;	/* ! */
		xtlck->twm.offset = 0;
		xtlck->hwm.offset = 0;

		xtlck->index = 2;
		break;

	case tlckINODE:
		linelock->l2linesize = L2INODESLOTSIZE;
		break;

	case tlckDATA:
		linelock->l2linesize = L2DATASLOTSIZE;
		break;

	default:
		/* unknown type: logged only, l2linesize is left unset */
		jERROR(1, ("UFO tlock:0x%p\n", tlck));
	}

	/*
	 * update tlock vector
	 */
      grantLock:
	tlck->type |= type;

	TXN_UNLOCK();

	return tlck;

	/*
	 * page is being locked by another transaction:
	 */
      waitLock:
	/* Only locks on ipimap or ipaimap should reach here */
	/* assert(jfs_ip->fileset == AGGREGATE_I); */
	if (jfs_ip->fileset != AGGREGATE_I) {
		jERROR(1, ("txLock: trying to lock locked page!\n"));
		dump_mem("ip", ip, sizeof(struct inode));
		dump_mem("mp", mp, sizeof(struct metapage));
		dump_mem("Locker's tblk", tid_to_tblock(tid),
			 sizeof(struct tblock));
		dump_mem("Tlock", tlck, sizeof(struct tlock));
		BUG();
	}
	INCREMENT(stattx.waitlock);	/* statistics */
	release_metapage(mp);

	jEVENT(0, ("txLock: in waitLock, tid = %d, xtid = %d, lid = %d\n",
		   tid, xtid, lid));
	/* sleeps with TXN_LOCK dropped; the lock is not re-taken */
	TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor);
	jEVENT(0, ("txLock: awakened     tid = %d, lid = %d\n", tid, lid));

	return NULL;
}
788
789
790/*
791 * NAME:        txRelease()
792 *
793 * FUNCTION:    Release buffers associated with transaction locks, but don't
794 *		mark homeok yet.  The allows other transactions to modify
795 *		buffers, but won't let them go to disk until commit record
796 *		actually gets written.
797 *
798 * PARAMETER:
799 *              tblk    -
800 *
801 * RETURN:      Errors from subroutines.
802 */
803static void txRelease(struct tblock * tblk)
804{
805	struct metapage *mp;
806	lid_t lid;
807	struct tlock *tlck;
808
809	TXN_LOCK();
810
811	for (lid = tblk->next; lid; lid = tlck->next) {
812		tlck = lid_to_tlock(lid);
813		if ((mp = tlck->mp) != NULL &&
814		    (tlck->type & tlckBTROOT) == 0) {
815			assert(mp->xflag & COMMIT_PAGE);
816			mp->lid = 0;
817		}
818	}
819
820	/*
821	 * wakeup transactions waiting on a page locked
822	 * by the current transaction
823	 */
824	TXN_WAKEUP(&tblk->waitor);
825
826	TXN_UNLOCK();
827}
828
829
830/*
831 * NAME:        txUnlock()
832 *
833 * FUNCTION:    Initiates pageout of pages modified by tid in journalled
834 *              objects and frees their lockwords.
835 */
836static void txUnlock(struct tblock * tblk)
837{
838	struct tlock *tlck;
839	struct linelock *linelock;
840	lid_t lid, next, llid, k;
841	struct metapage *mp;
842	struct jfs_log *log;
843	int difft, diffp;
844
845	jFYI(1, ("txUnlock: tblk = 0x%p\n", tblk));
846	log = JFS_SBI(tblk->sb)->log;
847
848	/*
849	 * mark page under tlock homeok (its log has been written):
850	 */
851	for (lid = tblk->next; lid; lid = next) {
852		tlck = lid_to_tlock(lid);
853		next = tlck->next;
854
855		jFYI(1, ("unlocking lid = %d, tlck = 0x%p\n", lid, tlck));
856
857		/* unbind page from tlock */
858		if ((mp = tlck->mp) != NULL &&
859		    (tlck->type & tlckBTROOT) == 0) {
860			assert(mp->xflag & COMMIT_PAGE);
861
862			/* hold buffer
863			 *
864			 * It's possible that someone else has the metapage.
865			 * The only things were changing are nohomeok, which
866			 * is handled atomically, and clsn which is protected
867			 * by the LOGSYNC_LOCK.
868			 */
869			hold_metapage(mp, 1);
870
871			assert(atomic_read(&mp->nohomeok) > 0);
872			atomic_dec(&mp->nohomeok);
873
874			/* inherit younger/larger clsn */
875			LOGSYNC_LOCK(log);
876			if (mp->clsn) {
877				logdiff(difft, tblk->clsn, log);
878				logdiff(diffp, mp->clsn, log);
879				if (difft > diffp)
880					mp->clsn = tblk->clsn;
881			} else
882				mp->clsn = tblk->clsn;
883			LOGSYNC_UNLOCK(log);
884
885			assert(!(tlck->flag & tlckFREEPAGE));
886
887			if (tlck->flag & tlckWRITEPAGE) {
888				write_metapage(mp);
889			} else {
890				/* release page which has been forced */
891				release_metapage(mp);
892			}
893		}
894
895		/* insert tlock, and linelock(s) of the tlock if any,
896		 * at head of freelist
897		 */
898		TXN_LOCK();
899
900		llid = ((struct linelock *) & tlck->lock)->next;
901		while (llid) {
902			linelock = (struct linelock *) lid_to_tlock(llid);
903			k = linelock->next;
904			txLockFree(llid);
905			llid = k;
906		}
907		txLockFree(lid);
908
909		TXN_UNLOCK();
910	}
911	tblk->next = tblk->last = 0;
912
913	/*
914	 * remove tblock from logsynclist
915	 * (allocation map pages inherited lsn of tblk and
916	 * has been inserted in logsync list at txUpdateMap())
917	 */
918	if (tblk->lsn) {
919		LOGSYNC_LOCK(log);
920		log->count--;
921		list_del(&tblk->synclist);
922		LOGSYNC_UNLOCK(log);
923	}
924}
925
926
927/*
928 *      txMaplock()
929 *
930 * function: allocate a transaction lock for freed page/entry;
931 *      for freed page, maplock is used as xtlock/dtlock type;
932 */
933struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
934{
935	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
936	lid_t lid;
937	struct tblock *tblk;
938	struct tlock *tlck;
939	struct maplock *maplock;
940
941	TXN_LOCK();
942
943	/*
944	 * allocate a tlock
945	 */
946	lid = txLockAlloc();
947	tlck = lid_to_tlock(lid);
948
949	/*
950	 * initialize tlock
951	 */
952	tlck->tid = tid;
953
954	/* bind the tlock and the object */
955	tlck->flag = tlckINODELOCK;
956	tlck->ip = ip;
957	tlck->mp = NULL;
958
959	tlck->type = type;
960
961	/*
962	 * enqueue transaction lock to transaction/inode
963	 */
964	/* insert the tlock at tail of transaction tlock list */
965	if (tid) {
966		tblk = tid_to_tblock(tid);
967		if (tblk->next)
968			lid_to_tlock(tblk->last)->next = lid;
969		else
970			tblk->next = lid;
971		tlck->next = 0;
972		tblk->last = lid;
973	}
974	/* anonymous transaction:
975	 * insert the tlock at head of inode anonymous tlock list
976	 */
977	else {
978		tlck->next = jfs_ip->atlhead;
979		jfs_ip->atlhead = lid;
980		if (tlck->next == 0) {
981			/* This inode's first anonymous transaction */
982			jfs_ip->atltail = lid;
983			list_add_tail(&jfs_ip->anon_inode_list,
984				      &TxAnchor.anon_list);
985		}
986	}
987
988	TXN_UNLOCK();
989
990	/* initialize type dependent area for maplock */
991	maplock = (struct maplock *) & tlck->lock;
992	maplock->next = 0;
993	maplock->maxcnt = 0;
994	maplock->index = 0;
995
996	return tlck;
997}
998
999
1000/*
1001 *      txLinelock()
1002 *
1003 * function: allocate a transaction lock for log vector list
1004 */
1005struct linelock *txLinelock(struct linelock * tlock)
1006{
1007	lid_t lid;
1008	struct tlock *tlck;
1009	struct linelock *linelock;
1010
1011	TXN_LOCK();
1012
1013	/* allocate a TxLock structure */
1014	lid = txLockAlloc();
1015	tlck = lid_to_tlock(lid);
1016
1017	TXN_UNLOCK();
1018
1019	/* initialize linelock */
1020	linelock = (struct linelock *) tlck;
1021	linelock->next = 0;
1022	linelock->flag = tlckLINELOCK;
1023	linelock->maxcnt = TLOCKLONG;
1024	linelock->index = 0;
1025
1026	/* append linelock after tlock */
1027	linelock->next = tlock->next;
1028	tlock->next = lid;
1029
1030	return linelock;
1031}
1032
1033
1034
1035/*
1036 *              transaction commit management
1037 *              -----------------------------
1038 */
1039
1040/*
1041 * NAME:        txCommit()
1042 *
1043 * FUNCTION:    commit the changes to the objects specified in
1044 *              clist.  For journalled segments only the
1045 *              changes of the caller are committed, ie by tid.
1046 *              for non-journalled segments the data are flushed to
1047 *              disk and then the change to the disk inode and indirect
1048 *              blocks committed (so blocks newly allocated to the
1049 *              segment will be made a part of the segment atomically).
1050 *
1051 *              all of the segments specified in clist must be in
1052 *              one file system. no more than 6 segments are needed
1053 *              to handle all unix svcs.
1054 *
 *              if the i_nlink field (i.e. disk inode link count)
 *              is zero, and the type of inode is a regular file,
 *              directory, or symbolic link, the inode is truncated
 *              to zero length. the truncation is committed but the
 *              VM resources are unaffected until it is closed (see
 *              iput and iclose).
1061 *
1062 * PARAMETER:
1063 *
1064 * RETURN:
1065 *
1066 * serialization:
1067 *              on entry the inode lock on each segment is assumed
1068 *              to be held.
1069 *
1070 * i/o error:
1071 */
1072int txCommit(tid_t tid,		/* transaction identifier */
1073	     int nip,		/* number of inodes to commit */
1074	     struct inode **iplist,	/* list of inode to commit */
1075	     int flag)
1076{
1077	int rc = 0, rc1 = 0;
1078	struct commit cd;
1079	struct jfs_log *log;
1080	struct tblock *tblk;
1081	struct lrd *lrd;
1082	int lsn;
1083	struct inode *ip;
1084	struct jfs_inode_info *jfs_ip;
1085	int k, n;
1086	ino_t top;
1087	struct super_block *sb;
1088
1089	jFYI(1, ("txCommit, tid = %d, flag = %d\n", tid, flag));
1090	/* is read-only file system ? */
1091	if (isReadOnly(iplist[0])) {
1092		rc = EROFS;
1093		goto TheEnd;
1094	}
1095
1096	sb = cd.sb = iplist[0]->i_sb;
1097	cd.tid = tid;
1098
1099	if (tid == 0)
1100		tid = txBegin(sb, 0);
1101	tblk = tid_to_tblock(tid);
1102
1103	/*
1104	 * initialize commit structure
1105	 */
1106	log = JFS_SBI(sb)->log;
1107	cd.log = log;
1108
1109	/* initialize log record descriptor in commit */
1110	lrd = &cd.lrd;
1111	lrd->logtid = cpu_to_le32(tblk->logtid);
1112	lrd->backchain = 0;
1113
1114	tblk->xflag |= flag;
1115
1116	if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0)
1117		tblk->xflag |= COMMIT_LAZY;
1118	/*
1119	 *      prepare non-journaled objects for commit
1120	 *
1121	 * flush data pages of non-journaled file
1122	 * to prevent the file getting non-initialized disk blocks
1123	 * in case of crash.
1124	 * (new blocks - )
1125	 */
1126	cd.iplist = iplist;
1127	cd.nip = nip;
1128
1129	/*
1130	 *      acquire transaction lock on (on-disk) inodes
1131	 *
1132	 * update on-disk inode from in-memory inode
1133	 * acquiring transaction locks for AFTER records
1134	 * on the on-disk inode of file object
1135	 *
1136	 * sort the inodes array by inode number in descending order
1137	 * to prevent deadlock when acquiring transaction lock
1138	 * of on-disk inodes on multiple on-disk inode pages by
1139	 * multiple concurrent transactions
1140	 */
1141	for (k = 0; k < cd.nip; k++) {
1142		top = (cd.iplist[k])->i_ino;
1143		for (n = k + 1; n < cd.nip; n++) {
1144			ip = cd.iplist[n];
1145			if (ip->i_ino > top) {
1146				top = ip->i_ino;
1147				cd.iplist[n] = cd.iplist[k];
1148				cd.iplist[k] = ip;
1149			}
1150		}
1151
1152		ip = cd.iplist[k];
1153		jfs_ip = JFS_IP(ip);
1154
1155		/*
1156		 * BUGBUG - Should we call filemap_fdatasync here instead
1157		 * of fsync_inode_data?
1158		 * If we do, we have a deadlock condition since we may end
1159		 * up recursively calling jfs_get_block with the IWRITELOCK
1160		 * held.  We may be able to do away with IWRITELOCK while
1161		 * committing transactions and use i_sem instead.
1162		 */
1163		if ((!S_ISDIR(ip->i_mode))
1164		    && (tblk->flag & COMMIT_DELETE) == 0)
1165			fsync_inode_data_buffers(ip);
1166
1167		/*
1168		 * Mark inode as not dirty.  It will still be on the dirty
1169		 * inode list, but we'll know not to commit it again unless
1170		 * it gets marked dirty again
1171		 */
1172		clear_cflag(COMMIT_Dirty, ip);
1173
1174		/* inherit anonymous tlock(s) of inode */
1175		if (jfs_ip->atlhead) {
1176			lid_to_tlock(jfs_ip->atltail)->next = tblk->next;
1177			tblk->next = jfs_ip->atlhead;
1178			if (!tblk->last)
1179				tblk->last = jfs_ip->atltail;
1180			jfs_ip->atlhead = jfs_ip->atltail = 0;
1181			TXN_LOCK();
1182			list_del_init(&jfs_ip->anon_inode_list);
1183			TXN_UNLOCK();
1184		}
1185
1186		/*
1187		 * acquire transaction lock on on-disk inode page
1188		 * (become first tlock of the tblk's tlock list)
1189		 */
1190		if (((rc = diWrite(tid, ip))))
1191			goto out;
1192	}
1193
1194	/*
1195	 *      write log records from transaction locks
1196	 *
1197	 * txUpdateMap() resets XAD_NEW in XAD.
1198	 */
1199	if ((rc = txLog(log, tblk, &cd)))
1200		goto TheEnd;
1201
1202	/*
1203	 * Ensure that inode isn't reused before
1204	 * lazy commit thread finishes processing
1205	 */
1206	if (tblk->xflag & (COMMIT_CREATE | COMMIT_DELETE))
1207		atomic_inc(&tblk->ip->i_count);
1208
1209	ASSERT((!(tblk->xflag & COMMIT_DELETE)) ||
1210	       ((tblk->ip->i_nlink == 0) &&
1211		!test_cflag(COMMIT_Nolink, tblk->ip)));
1212
1213	/*
1214	 *      write COMMIT log record
1215	 */
1216	lrd->type = cpu_to_le16(LOG_COMMIT);
1217	lrd->length = 0;
1218	lsn = lmLog(log, tblk, lrd, NULL);
1219
1220	lmGroupCommit(log, tblk);
1221
1222	/*
1223	 *      - transaction is now committed -
1224	 */
1225
1226	/*
1227	 * force pages in careful update
1228	 * (imap addressing structure update)
1229	 */
1230	if (flag & COMMIT_FORCE)
1231		txForce(tblk);
1232
1233	/*
1234	 *      update allocation map.
1235	 *
1236	 * update inode allocation map and inode:
1237	 * free pager lock on memory object of inode if any.
1238	 * update  block allocation map.
1239	 *
1240	 * txUpdateMap() resets XAD_NEW in XAD.
1241	 */
1242	if (tblk->xflag & COMMIT_FORCE)
1243		txUpdateMap(tblk);
1244
1245	/*
1246	 *      free transaction locks and pageout/free pages
1247	 */
1248	txRelease(tblk);
1249
1250	if ((tblk->flag & tblkGC_LAZY) == 0)
1251		txUnlock(tblk);
1252
1253
1254	/*
1255	 *      reset in-memory object state
1256	 */
1257	for (k = 0; k < cd.nip; k++) {
1258		ip = cd.iplist[k];
1259		jfs_ip = JFS_IP(ip);
1260
1261		/*
1262		 * reset in-memory inode state
1263		 */
1264		jfs_ip->bxflag = 0;
1265		jfs_ip->blid = 0;
1266	}
1267
1268      out:
1269	if (rc != 0)
1270		txAbortCommit(&cd, rc);
1271	else
1272		rc = rc1;
1273
1274      TheEnd:
1275	jFYI(1, ("txCommit: tid = %d, returning %d\n", tid, rc));
1276	return rc;
1277}
1278
1279
1280/*
1281 * NAME:        txLog()
1282 *
1283 * FUNCTION:    Writes AFTER log records for all lines modified
1284 *              by tid for segments specified by inodes in comdata.
1285 *              Code assumes only WRITELOCKS are recorded in lockwords.
1286 *
1287 * PARAMETERS:
1288 *
1289 * RETURN :
1290 */
1291static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
1292{
1293	int rc = 0;
1294	struct inode *ip;
1295	lid_t lid;
1296	struct tlock *tlck;
1297	struct lrd *lrd = &cd->lrd;
1298
1299	/*
1300	 * write log record(s) for each tlock of transaction,
1301	 */
1302	for (lid = tblk->next; lid; lid = tlck->next) {
1303		tlck = lid_to_tlock(lid);
1304
1305		tlck->flag |= tlckLOG;
1306
1307		/* initialize lrd common */
1308		ip = tlck->ip;
1309		lrd->aggregate = cpu_to_le32(kdev_t_to_nr(ip->i_dev));
1310		lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset);
1311		lrd->log.redopage.inode = cpu_to_le32(ip->i_ino);
1312
1313		if (tlck->mp)
1314			hold_metapage(tlck->mp, 0);
1315
1316		/* write log record of page from the tlock */
1317		switch (tlck->type & tlckTYPE) {
1318		case tlckXTREE:
1319			xtLog(log, tblk, lrd, tlck);
1320			break;
1321
1322		case tlckDTREE:
1323			dtLog(log, tblk, lrd, tlck);
1324			break;
1325
1326		case tlckINODE:
1327			diLog(log, tblk, lrd, tlck, cd);
1328			break;
1329
1330		case tlckMAP:
1331			mapLog(log, tblk, lrd, tlck);
1332			break;
1333
1334		case tlckDATA:
1335			dataLog(log, tblk, lrd, tlck);
1336			break;
1337
1338		default:
1339			jERROR(1, ("UFO tlock:0x%p\n", tlck));
1340		}
1341		if (tlck->mp)
1342			release_metapage(tlck->mp);
1343	}
1344
1345	return rc;
1346}
1347
1348
1349/*
1350 *      diLog()
1351 *
1352 * function:    log inode tlock and format maplock to update bmap;
1353 */
1354int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1355	  struct tlock * tlck, struct commit * cd)
1356{
1357	int rc = 0;
1358	struct metapage *mp;
1359	pxd_t *pxd;
1360	struct pxd_lock *pxdlock;
1361
1362	mp = tlck->mp;
1363
1364	/* initialize as REDOPAGE record format */
1365	lrd->log.redopage.type = cpu_to_le16(LOG_INODE);
1366	lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE);
1367
1368	pxd = &lrd->log.redopage.pxd;
1369
1370	/*
1371	 *      inode after image
1372	 */
1373	if (tlck->type & tlckENTRY) {
1374		/* log after-image for logredo(): */
1375		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1376//              *pxd = mp->cm_pxd;
1377		PXDaddress(pxd, mp->index);
1378		PXDlength(pxd,
1379			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1380		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1381
1382		/* mark page as homeward bound */
1383		tlck->flag |= tlckWRITEPAGE;
1384	} else if (tlck->type & tlckFREE) {
1385		/*
1386		 *      free inode extent
1387		 *
1388		 * (pages of the freed inode extent have been invalidated and
1389		 * a maplock for free of the extent has been formatted at
1390		 * txLock() time);
1391		 *
1392		 * the tlock had been acquired on the inode allocation map page
1393		 * (iag) that specifies the freed extent, even though the map
1394		 * page is not itself logged, to prevent pageout of the map
1395		 * page before the log;
1396		 */
1397		assert(tlck->type & tlckFREE);
1398
1399		/* log LOG_NOREDOINOEXT of the freed inode extent for
1400		 * logredo() to start NoRedoPage filters, and to update
1401		 * imap and bmap for free of the extent;
1402		 */
1403		lrd->type = cpu_to_le16(LOG_NOREDOINOEXT);
1404		/*
1405		 * For the LOG_NOREDOINOEXT record, we need
1406		 * to pass the IAG number and inode extent
1407		 * index (within that IAG) from which the
1408		 * the extent being released.  These have been
1409		 * passed to us in the iplist[1] and iplist[2].
1410		 */
1411		lrd->log.noredoinoext.iagnum =
1412		    cpu_to_le32((u32) (size_t) cd->iplist[1]);
1413		lrd->log.noredoinoext.inoext_idx =
1414		    cpu_to_le32((u32) (size_t) cd->iplist[2]);
1415
1416		pxdlock = (struct pxd_lock *) & tlck->lock;
1417		*pxd = pxdlock->pxd;
1418		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1419
1420		/* update bmap */
1421		tlck->flag |= tlckUPDATEMAP;
1422
1423		/* mark page as homeward bound */
1424		tlck->flag |= tlckWRITEPAGE;
1425	} else {
1426		jERROR(2, ("diLog: UFO type tlck:0x%p\n", tlck));
1427	}
1428#ifdef  _JFS_WIP
1429	/*
1430	 *      alloc/free external EA extent
1431	 *
1432	 * a maplock for txUpdateMap() to update bPWMAP for alloc/free
1433	 * of the extent has been formatted at txLock() time;
1434	 */
1435	else {
1436		assert(tlck->type & tlckEA);
1437
1438		/* log LOG_UPDATEMAP for logredo() to update bmap for
1439		 * alloc of new (and free of old) external EA extent;
1440		 */
1441		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1442		pxdlock = (struct pxd_lock *) & tlck->lock;
1443		nlock = pxdlock->index;
1444		for (i = 0; i < nlock; i++, pxdlock++) {
1445			if (pxdlock->flag & mlckALLOCPXD)
1446				lrd->log.updatemap.type =
1447				    cpu_to_le16(LOG_ALLOCPXD);
1448			else
1449				lrd->log.updatemap.type =
1450				    cpu_to_le16(LOG_FREEPXD);
1451			lrd->log.updatemap.nxd = cpu_to_le16(1);
1452			lrd->log.updatemap.pxd = pxdlock->pxd;
1453			lrd->backchain =
1454			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1455		}
1456
1457		/* update bmap */
1458		tlck->flag |= tlckUPDATEMAP;
1459	}
1460#endif				/* _JFS_WIP */
1461
1462	return rc;
1463}
1464
1465
1466/*
1467 *      dataLog()
1468 *
1469 * function:    log data tlock
1470 */
1471int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1472	    struct tlock * tlck)
1473{
1474	struct metapage *mp;
1475	pxd_t *pxd;
1476	int rc;
1477	s64 xaddr;
1478	int xflag;
1479	s32 xlen;
1480
1481	mp = tlck->mp;
1482
1483	/* initialize as REDOPAGE record format */
1484	lrd->log.redopage.type = cpu_to_le16(LOG_DATA);
1485	lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE);
1486
1487	pxd = &lrd->log.redopage.pxd;
1488
1489	/* log after-image for logredo(): */
1490	lrd->type = cpu_to_le16(LOG_REDOPAGE);
1491
1492	if (JFS_IP(tlck->ip)->next_index < MAX_INLINE_DIRTABLE_ENTRY) {
1493		/*
1494		 * The table has been truncated, we've must have deleted
1495		 * the last entry, so don't bother logging this
1496		 */
1497		mp->lid = 0;
1498		atomic_dec(&mp->nohomeok);
1499		discard_metapage(mp);
1500		tlck->mp = 0;
1501		return 0;
1502	}
1503
1504	rc = xtLookup(tlck->ip, mp->index, 1, &xflag, &xaddr, &xlen, 1);
1505	if (rc || (xlen == 0)) {
1506		jERROR(1, ("dataLog: can't find physical address\n"));
1507		return 0;
1508	}
1509
1510	PXDaddress(pxd, xaddr);
1511	PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits);
1512
1513	lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1514
1515	/* mark page as homeward bound */
1516	tlck->flag |= tlckWRITEPAGE;
1517
1518	return 0;
1519}
1520
1521
1522/*
1523 *      dtLog()
1524 *
1525 * function:    log dtree tlock and format maplock to update bmap;
1526 */
1527void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1528	   struct tlock * tlck)
1529{
1530	struct inode *ip;
1531	struct metapage *mp;
1532	struct pxd_lock *pxdlock;
1533	pxd_t *pxd;
1534
1535	ip = tlck->ip;
1536	mp = tlck->mp;
1537
1538	/* initialize as REDOPAGE/NOREDOPAGE record format */
1539	lrd->log.redopage.type = cpu_to_le16(LOG_DTREE);
1540	lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE);
1541
1542	pxd = &lrd->log.redopage.pxd;
1543
1544	if (tlck->type & tlckBTROOT)
1545		lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
1546
1547	/*
1548	 *      page extension via relocation: entry insertion;
1549	 *      page extension in-place: entry insertion;
1550	 *      new right page from page split, reinitialized in-line
1551	 *      root from root page split: entry insertion;
1552	 */
1553	if (tlck->type & (tlckNEW | tlckEXTEND)) {
1554		/* log after-image of the new page for logredo():
1555		 * mark log (LOG_NEW) for logredo() to initialize
1556		 * freelist and update bmap for alloc of the new page;
1557		 */
1558		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1559		if (tlck->type & tlckEXTEND)
1560			lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND);
1561		else
1562			lrd->log.redopage.type |= cpu_to_le16(LOG_NEW);
1563//              *pxd = mp->cm_pxd;
1564		PXDaddress(pxd, mp->index);
1565		PXDlength(pxd,
1566			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1567		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1568
1569		/* format a maplock for txUpdateMap() to update bPMAP for
1570		 * alloc of the new page;
1571		 */
1572		if (tlck->type & tlckBTROOT)
1573			return;
1574		tlck->flag |= tlckUPDATEMAP;
1575		pxdlock = (struct pxd_lock *) & tlck->lock;
1576		pxdlock->flag = mlckALLOCPXD;
1577		pxdlock->pxd = *pxd;
1578
1579		pxdlock->index = 1;
1580
1581		/* mark page as homeward bound */
1582		tlck->flag |= tlckWRITEPAGE;
1583		return;
1584	}
1585
1586	/*
1587	 *      entry insertion/deletion,
1588	 *      sibling page link update (old right page before split);
1589	 */
1590	if (tlck->type & (tlckENTRY | tlckRELINK)) {
1591		/* log after-image for logredo(): */
1592		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1593		PXDaddress(pxd, mp->index);
1594		PXDlength(pxd,
1595			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1596		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1597
1598		/* mark page as homeward bound */
1599		tlck->flag |= tlckWRITEPAGE;
1600		return;
1601	}
1602
1603	/*
1604	 *      page deletion: page has been invalidated
1605	 *      page relocation: source extent
1606	 *
1607	 *      a maplock for free of the page has been formatted
1608	 *      at txLock() time);
1609	 */
1610	if (tlck->type & (tlckFREE | tlckRELOCATE)) {
1611		/* log LOG_NOREDOPAGE of the deleted page for logredo()
1612		 * to start NoRedoPage filter and to update bmap for free
1613		 * of the deletd page
1614		 */
1615		lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
1616		pxdlock = (struct pxd_lock *) & tlck->lock;
1617		*pxd = pxdlock->pxd;
1618		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1619
1620		/* a maplock for txUpdateMap() for free of the page
1621		 * has been formatted at txLock() time;
1622		 */
1623		tlck->flag |= tlckUPDATEMAP;
1624	}
1625	return;
1626}
1627
1628
1629/*
1630 *      xtLog()
1631 *
1632 * function:    log xtree tlock and format maplock to update bmap;
1633 */
1634void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1635	   struct tlock * tlck)
1636{
1637	struct inode *ip;
1638	struct metapage *mp;
1639	xtpage_t *p;
1640	struct xtlock *xtlck;
1641	struct maplock *maplock;
1642	struct xdlistlock *xadlock;
1643	struct pxd_lock *pxdlock;
1644	pxd_t *pxd;
1645	int next, lwm, hwm;
1646
1647	ip = tlck->ip;
1648	mp = tlck->mp;
1649
1650	/* initialize as REDOPAGE/NOREDOPAGE record format */
1651	lrd->log.redopage.type = cpu_to_le16(LOG_XTREE);
1652	lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE);
1653
1654	pxd = &lrd->log.redopage.pxd;
1655
1656	if (tlck->type & tlckBTROOT) {
1657		lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
1658		p = &JFS_IP(ip)->i_xtroot;
1659		if (S_ISDIR(ip->i_mode))
1660			lrd->log.redopage.type |=
1661			    cpu_to_le16(LOG_DIR_XTREE);
1662	} else
1663		p = (xtpage_t *) mp->data;
1664	next = le16_to_cpu(p->header.nextindex);
1665
1666	xtlck = (struct xtlock *) & tlck->lock;
1667
1668	maplock = (struct maplock *) & tlck->lock;
1669	xadlock = (struct xdlistlock *) maplock;
1670
1671	/*
1672	 *      entry insertion/extension;
1673	 *      sibling page link update (old right page before split);
1674	 */
1675	if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) {
1676		/* log after-image for logredo():
1677		 * logredo() will update bmap for alloc of new/extended
1678		 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
1679		 * after-image of XADlist;
1680		 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
1681		 * applying the after-image to the meta-data page.
1682		 */
1683		lrd->type = cpu_to_le16(LOG_REDOPAGE);
1684//              *pxd = mp->cm_pxd;
1685		PXDaddress(pxd, mp->index);
1686		PXDlength(pxd,
1687			  mp->logical_size >> tblk->sb->s_blocksize_bits);
1688		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1689
1690		/* format a maplock for txUpdateMap() to update bPMAP
1691		 * for alloc of new/extended extents of XAD[lwm:next)
1692		 * from the page itself;
1693		 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
1694		 */
1695		lwm = xtlck->lwm.offset;
1696		if (lwm == 0)
1697			lwm = XTPAGEMAXSLOT;
1698
1699		if (lwm == next)
1700			goto out;
1701		assert(lwm < next);
1702		tlck->flag |= tlckUPDATEMAP;
1703		xadlock->flag = mlckALLOCXADLIST;
1704		xadlock->count = next - lwm;
1705		if ((xadlock->count <= 2) && (tblk->xflag & COMMIT_LAZY)) {
1706			int i;
1707			/*
1708			 * Lazy commit may allow xtree to be modified before
1709			 * txUpdateMap runs.  Copy xad into linelock to
1710			 * preserve correct data.
1711			 */
1712			xadlock->xdlist = &xtlck->pxdlock;
1713			memcpy(xadlock->xdlist, &p->xad[lwm],
1714			       sizeof(xad_t) * xadlock->count);
1715
1716			for (i = 0; i < xadlock->count; i++)
1717				p->xad[lwm + i].flag &=
1718				    ~(XAD_NEW | XAD_EXTENDED);
1719		} else {
1720			/*
1721			 * xdlist will point to into inode's xtree, ensure
1722			 * that transaction is not committed lazily.
1723			 */
1724			xadlock->xdlist = &p->xad[lwm];
1725			tblk->xflag &= ~COMMIT_LAZY;
1726		}
1727		jFYI(1,
1728		     ("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d count:%d\n",
1729		      tlck->ip, mp, tlck, lwm, xadlock->count));
1730
1731		maplock->index = 1;
1732
1733	      out:
1734		/* mark page as homeward bound */
1735		tlck->flag |= tlckWRITEPAGE;
1736
1737		return;
1738	}
1739
1740	/*
1741	 *      page deletion: file deletion/truncation (ref. xtTruncate())
1742	 *
1743	 * (page will be invalidated after log is written and bmap
1744	 * is updated from the page);
1745	 */
1746	if (tlck->type & tlckFREE) {
1747		/* LOG_NOREDOPAGE log for NoRedoPage filter:
1748		 * if page free from file delete, NoRedoFile filter from
1749		 * inode image of zero link count will subsume NoRedoPage
1750		 * filters for each page;
1751		 * if page free from file truncattion, write NoRedoPage
1752		 * filter;
1753		 *
1754		 * upadte of block allocation map for the page itself:
1755		 * if page free from deletion and truncation, LOG_UPDATEMAP
1756		 * log for the page itself is generated from processing
1757		 * its parent page xad entries;
1758		 */
1759		/* if page free from file truncation, log LOG_NOREDOPAGE
1760		 * of the deleted page for logredo() to start NoRedoPage
1761		 * filter for the page;
1762		 */
1763		if (tblk->xflag & COMMIT_TRUNCATE) {
1764			/* write NOREDOPAGE for the page */
1765			lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
1766			PXDaddress(pxd, mp->index);
1767			PXDlength(pxd,
1768				  mp->logical_size >> tblk->sb->
1769				  s_blocksize_bits);
1770			lrd->backchain =
1771			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1772
1773			if (tlck->type & tlckBTROOT) {
1774				/* Empty xtree must be logged */
1775				lrd->type = cpu_to_le16(LOG_REDOPAGE);
1776				lrd->backchain =
1777				    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1778			}
1779		}
1780
1781		/* init LOG_UPDATEMAP of the freed extents
1782		 * XAD[XTENTRYSTART:hwm) from the deleted page itself
1783		 * for logredo() to update bmap;
1784		 */
1785		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1786		lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST);
1787		xtlck = (struct xtlock *) & tlck->lock;
1788		hwm = xtlck->hwm.offset;
1789		lrd->log.updatemap.nxd =
1790		    cpu_to_le16(hwm - XTENTRYSTART + 1);
1791		/* reformat linelock for lmLog() */
1792		xtlck->header.offset = XTENTRYSTART;
1793		xtlck->header.length = hwm - XTENTRYSTART + 1;
1794		xtlck->index = 1;
1795		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1796
1797		/* format a maplock for txUpdateMap() to update bmap
1798		 * to free extents of XAD[XTENTRYSTART:hwm) from the
1799		 * deleted page itself;
1800		 */
1801		tlck->flag |= tlckUPDATEMAP;
1802		xadlock->flag = mlckFREEXADLIST;
1803		xadlock->count = hwm - XTENTRYSTART + 1;
1804		if ((xadlock->count <= 2) && (tblk->xflag & COMMIT_LAZY)) {
1805			/*
1806			 * Lazy commit may allow xtree to be modified before
1807			 * txUpdateMap runs.  Copy xad into linelock to
1808			 * preserve correct data.
1809			 */
1810			xadlock->xdlist = &xtlck->pxdlock;
1811			memcpy(xadlock->xdlist, &p->xad[XTENTRYSTART],
1812			       sizeof(xad_t) * xadlock->count);
1813		} else {
1814			/*
1815			 * xdlist will point to into inode's xtree, ensure
1816			 * that transaction is not committed lazily unless
1817			 * we're deleting the inode (unlink).  In that case
1818			 * we have special logic for the inode to be
1819			 * unlocked by the lazy commit thread.
1820			 */
1821			xadlock->xdlist = &p->xad[XTENTRYSTART];
1822			if ((tblk->xflag & COMMIT_LAZY) &&
1823			    (tblk->xflag & COMMIT_DELETE) &&
1824			    (tblk->ip == ip))
1825				set_cflag(COMMIT_Holdlock, ip);
1826			else
1827				tblk->xflag &= ~COMMIT_LAZY;
1828		}
1829		jFYI(1,
1830		     ("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2\n",
1831		      tlck->ip, mp, xadlock->count));
1832
1833		maplock->index = 1;
1834
1835		/* mark page as invalid */
1836		if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode))
1837		    && !(tlck->type & tlckBTROOT))
1838			tlck->flag |= tlckFREEPAGE;
1839		/*
1840		   else (tblk->xflag & COMMIT_PMAP)
1841		   ? release the page;
1842		 */
1843		return;
1844	}
1845
1846	/*
1847	 *      page/entry truncation: file truncation (ref. xtTruncate())
1848	 *
1849	 *     |----------+------+------+---------------|
1850	 *                |      |      |
1851	 *                |      |     hwm - hwm before truncation
1852	 *                |     next - truncation point
1853	 *               lwm - lwm before truncation
1854	 * header ?
1855	 */
1856	if (tlck->type & tlckTRUNCATE) {
1857		pxd_t tpxd;	/* truncated extent of xad */
1858		int twm;
1859
1860		/*
1861		 * For truncation the entire linelock may be used, so it would
1862		 * be difficult to store xad list in linelock itself.
1863		 * Therefore, we'll just force transaction to be committed
1864		 * synchronously, so that xtree pages won't be changed before
1865		 * txUpdateMap runs.
1866		 */
1867		tblk->xflag &= ~COMMIT_LAZY;
1868		lwm = xtlck->lwm.offset;
1869		if (lwm == 0)
1870			lwm = XTPAGEMAXSLOT;
1871		hwm = xtlck->hwm.offset;
1872		twm = xtlck->twm.offset;
1873
1874		/*
1875		 *      write log records
1876		 */
1877		/*
1878		 * allocate entries XAD[lwm:next]:
1879		 */
1880		if (lwm < next) {
1881			/* log after-image for logredo():
1882			 * logredo() will update bmap for alloc of new/extended
1883			 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
1884			 * after-image of XADlist;
1885			 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
1886			 * applying the after-image to the meta-data page.
1887			 */
1888			lrd->type = cpu_to_le16(LOG_REDOPAGE);
1889			PXDaddress(pxd, mp->index);
1890			PXDlength(pxd,
1891				  mp->logical_size >> tblk->sb->
1892				  s_blocksize_bits);
1893			lrd->backchain =
1894			    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1895		}
1896
1897		/*
1898		 * truncate entry XAD[twm == next - 1]:
1899		 */
1900		if (twm == next - 1) {
1901			/* init LOG_UPDATEMAP for logredo() to update bmap for
1902			 * free of truncated delta extent of the truncated
1903			 * entry XAD[next - 1]:
1904			 * (xtlck->pxdlock = truncated delta extent);
1905			 */
1906			pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
1907			/* assert(pxdlock->type & tlckTRUNCATE); */
1908			lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1909			lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
1910			lrd->log.updatemap.nxd = cpu_to_le16(1);
1911			lrd->log.updatemap.pxd = pxdlock->pxd;
1912			tpxd = pxdlock->pxd;	/* save to format maplock */
1913			lrd->backchain =
1914			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1915		}
1916
1917		/*
1918		 * free entries XAD[next:hwm]:
1919		 */
1920		if (hwm >= next) {
1921			/* init LOG_UPDATEMAP of the freed extents
1922			 * XAD[next:hwm] from the deleted page itself
1923			 * for logredo() to update bmap;
1924			 */
1925			lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1926			lrd->log.updatemap.type =
1927			    cpu_to_le16(LOG_FREEXADLIST);
1928			xtlck = (struct xtlock *) & tlck->lock;
1929			hwm = xtlck->hwm.offset;
1930			lrd->log.updatemap.nxd =
1931			    cpu_to_le16(hwm - next + 1);
1932			/* reformat linelock for lmLog() */
1933			xtlck->header.offset = next;
1934			xtlck->header.length = hwm - next + 1;
1935			xtlck->index = 1;
1936			lrd->backchain =
1937			    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1938		}
1939
1940		/*
1941		 *      format maplock(s) for txUpdateMap() to update bmap
1942		 */
1943		maplock->index = 0;
1944
1945		/*
1946		 * allocate entries XAD[lwm:next):
1947		 */
1948		if (lwm < next) {
1949			/* format a maplock for txUpdateMap() to update bPMAP
1950			 * for alloc of new/extended extents of XAD[lwm:next)
1951			 * from the page itself;
1952			 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
1953			 */
1954			tlck->flag |= tlckUPDATEMAP;
1955			xadlock->flag = mlckALLOCXADLIST;
1956			xadlock->count = next - lwm;
1957			xadlock->xdlist = &p->xad[lwm];
1958
1959			jFYI(1,
1960			     ("xtLog: alloc ip:0x%p mp:0x%p count:%d lwm:%d next:%d\n",
1961			      tlck->ip, mp, xadlock->count, lwm, next));
1962			maplock->index++;
1963			xadlock++;
1964		}
1965
1966		/*
1967		 * truncate entry XAD[twm == next - 1]:
1968		 */
1969		if (twm == next - 1) {
1970			struct pxd_lock *pxdlock;
1971
1972			/* format a maplock for txUpdateMap() to update bmap
1973			 * to free truncated delta extent of the truncated
1974			 * entry XAD[next - 1];
1975			 * (xtlck->pxdlock = truncated delta extent);
1976			 */
1977			tlck->flag |= tlckUPDATEMAP;
1978			pxdlock = (struct pxd_lock *) xadlock;
1979			pxdlock->flag = mlckFREEPXD;
1980			pxdlock->count = 1;
1981			pxdlock->pxd = tpxd;
1982
1983			jFYI(1,
1984			     ("xtLog: truncate ip:0x%p mp:0x%p count:%d hwm:%d\n",
1985			      ip, mp, pxdlock->count, hwm));
1986			maplock->index++;
1987			xadlock++;
1988		}
1989
1990		/*
1991		 * free entries XAD[next:hwm]:
1992		 */
1993		if (hwm >= next) {
1994			/* format a maplock for txUpdateMap() to update bmap
1995			 * to free extents of XAD[next:hwm] from thedeleted
1996			 * page itself;
1997			 */
1998			tlck->flag |= tlckUPDATEMAP;
1999			xadlock->flag = mlckFREEXADLIST;
2000			xadlock->count = hwm - next + 1;
2001			xadlock->xdlist = &p->xad[next];
2002
2003			jFYI(1,
2004			     ("xtLog: free ip:0x%p mp:0x%p count:%d next:%d hwm:%d\n",
2005			      tlck->ip, mp, xadlock->count, next, hwm));
2006			maplock->index++;
2007		}
2008
2009		/* mark page as homeward bound */
2010		tlck->flag |= tlckWRITEPAGE;
2011	}
2012	return;
2013}
2014
2015
2016/*
2017 *      mapLog()
2018 *
2019 * function:    log from maplock of freed data extents;
2020 */
2021void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2022	    struct tlock * tlck)
2023{
2024	struct pxd_lock *pxdlock;
2025	int i, nlock;
2026	pxd_t *pxd;
2027
2028	/*
2029	 *      page relocation: free the source page extent
2030	 *
2031	 * a maplock for txUpdateMap() for free of the page
2032	 * has been formatted at txLock() time saving the src
2033	 * relocated page address;
2034	 */
2035	if (tlck->type & tlckRELOCATE) {
2036		/* log LOG_NOREDOPAGE of the old relocated page
2037		 * for logredo() to start NoRedoPage filter;
2038		 */
2039		lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
2040		pxdlock = (struct pxd_lock *) & tlck->lock;
2041		pxd = &lrd->log.redopage.pxd;
2042		*pxd = pxdlock->pxd;
2043		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2044
2045		/* (N.B. currently, logredo() does NOT update bmap
2046		 * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE);
2047		 * if page free from relocation, LOG_UPDATEMAP log is
2048		 * specifically generated now for logredo()
2049		 * to update bmap for free of src relocated page;
2050		 * (new flag LOG_RELOCATE may be introduced which will
2051		 * inform logredo() to start NORedoPage filter and also
2052		 * update block allocation map at the same time, thus
2053		 * avoiding an extra log write);
2054		 */
2055		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
2056		lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
2057		lrd->log.updatemap.nxd = cpu_to_le16(1);
2058		lrd->log.updatemap.pxd = pxdlock->pxd;
2059		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2060
2061		/* a maplock for txUpdateMap() for free of the page
2062		 * has been formatted at txLock() time;
2063		 */
2064		tlck->flag |= tlckUPDATEMAP;
2065		return;
2066	}
2067	/*
2068
2069	 * Otherwise it's not a relocate request
2070	 *
2071	 */
2072	else {
2073		/* log LOG_UPDATEMAP for logredo() to update bmap for
2074		 * free of truncated/relocated delta extent of the data;
2075		 * e.g.: external EA extent, relocated/truncated extent
2076		 * from xtTailgate();
2077		 */
2078		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
2079		pxdlock = (struct pxd_lock *) & tlck->lock;
2080		nlock = pxdlock->index;
2081		for (i = 0; i < nlock; i++, pxdlock++) {
2082			if (pxdlock->flag & mlckALLOCPXD)
2083				lrd->log.updatemap.type =
2084				    cpu_to_le16(LOG_ALLOCPXD);
2085			else
2086				lrd->log.updatemap.type =
2087				    cpu_to_le16(LOG_FREEPXD);
2088			lrd->log.updatemap.nxd = cpu_to_le16(1);
2089			lrd->log.updatemap.pxd = pxdlock->pxd;
2090			lrd->backchain =
2091			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2092			jFYI(1, ("mapLog: xaddr:0x%lx xlen:0x%x\n",
2093				 (ulong) addressPXD(&pxdlock->pxd),
2094				 lengthPXD(&pxdlock->pxd)));
2095		}
2096
2097		/* update bmap */
2098		tlck->flag |= tlckUPDATEMAP;
2099	}
2100}
2101
2102
2103/*
2104 *      txEA()
2105 *
2106 * function:    acquire maplock for EA/ACL extents or
2107 *              set COMMIT_INLINE flag;
2108 */
2109void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
2110{
2111	struct tlock *tlck = NULL;
2112	struct pxd_lock *maplock = NULL, *pxdlock = NULL;
2113
2114	/*
2115	 * format maplock for alloc of new EA extent
2116	 */
2117	if (newea) {
2118		/* Since the newea could be a completely zeroed entry we need to
2119		 * check for the two flags which indicate we should actually
2120		 * commit new EA data
2121		 */
2122		if (newea->flag & DXD_EXTENT) {
2123			tlck = txMaplock(tid, ip, tlckMAP);
2124			maplock = (struct pxd_lock *) & tlck->lock;
2125			pxdlock = (struct pxd_lock *) maplock;
2126			pxdlock->flag = mlckALLOCPXD;
2127			PXDaddress(&pxdlock->pxd, addressDXD(newea));
2128			PXDlength(&pxdlock->pxd, lengthDXD(newea));
2129			pxdlock++;
2130			maplock->index = 1;
2131		} else if (newea->flag & DXD_INLINE) {
2132			tlck = NULL;
2133
2134			set_cflag(COMMIT_Inlineea, ip);
2135		}
2136	}
2137
2138	/*
2139	 * format maplock for free of old EA extent
2140	 */
2141	if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) {
2142		if (tlck == NULL) {
2143			tlck = txMaplock(tid, ip, tlckMAP);
2144			maplock = (struct pxd_lock *) & tlck->lock;
2145			pxdlock = (struct pxd_lock *) maplock;
2146			maplock->index = 0;
2147		}
2148		pxdlock->flag = mlckFREEPXD;
2149		PXDaddress(&pxdlock->pxd, addressDXD(oldea));
2150		PXDlength(&pxdlock->pxd, lengthDXD(oldea));
2151		maplock->index++;
2152	}
2153}
2154
2155
2156/*
2157 *      txForce()
2158 *
2159 * function: synchronously write pages locked by transaction
2160 *              after txLog() but before txUpdateMap();
2161 */
2162void txForce(struct tblock * tblk)
2163{
2164	struct tlock *tlck;
2165	lid_t lid, next;
2166	struct metapage *mp;
2167
2168	/*
2169	 * reverse the order of transaction tlocks in
2170	 * careful update order of address index pages
2171	 * (right to left, bottom up)
2172	 */
2173	tlck = lid_to_tlock(tblk->next);
2174	lid = tlck->next;
2175	tlck->next = 0;
2176	while (lid) {
2177		tlck = lid_to_tlock(lid);
2178		next = tlck->next;
2179		tlck->next = tblk->next;
2180		tblk->next = lid;
2181		lid = next;
2182	}
2183
2184	/*
2185	 * synchronously write the page, and
2186	 * hold the page for txUpdateMap();
2187	 */
2188	for (lid = tblk->next; lid; lid = next) {
2189		tlck = lid_to_tlock(lid);
2190		next = tlck->next;
2191
2192		if ((mp = tlck->mp) != NULL &&
2193		    (tlck->type & tlckBTROOT) == 0) {
2194			assert(mp->xflag & COMMIT_PAGE);
2195
2196			if (tlck->flag & tlckWRITEPAGE) {
2197				tlck->flag &= ~tlckWRITEPAGE;
2198
2199				/* do not release page to freelist */
2200				assert(atomic_read(&mp->nohomeok));
2201				hold_metapage(mp, 0);
2202				write_metapage(mp);
2203			}
2204		}
2205	}
2206}
2207
2208
2209/*
2210 *      txUpdateMap()
2211 *
2212 * function:    update persistent allocation map (and working map
2213 *              if appropriate);
2214 *
2215 * parameter:
2216 */
2217static void txUpdateMap(struct tblock * tblk)
2218{
2219	struct inode *ip;
2220	struct inode *ipimap;
2221	lid_t lid;
2222	struct tlock *tlck;
2223	struct maplock *maplock;
2224	struct pxd_lock pxdlock;
2225	int maptype;
2226	int k, nlock;
2227	struct metapage *mp = 0;
2228
2229	ipimap = JFS_SBI(tblk->sb)->ipimap;
2230
2231	maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP;
2232
2233
2234	/*
2235	 *      update block allocation map
2236	 *
2237	 * update allocation state in pmap (and wmap) and
2238	 * update lsn of the pmap page;
2239	 */
2240	/*
2241	 * scan each tlock/page of transaction for block allocation/free:
2242	 *
2243	 * for each tlock/page of transaction, update map.
2244	 *  ? are there tlock for pmap and pwmap at the same time ?
2245	 */
2246	for (lid = tblk->next; lid; lid = tlck->next) {
2247		tlck = lid_to_tlock(lid);
2248
2249		if ((tlck->flag & tlckUPDATEMAP) == 0)
2250			continue;
2251
2252		if (tlck->flag & tlckFREEPAGE) {
2253			/*
2254			 * Another thread may attempt to reuse freed space
2255			 * immediately, so we want to get rid of the metapage
2256			 * before anyone else has a chance to get it.
2257			 * Lock metapage, update maps, then invalidate
2258			 * the metapage.
2259			 */
2260			mp = tlck->mp;
2261			ASSERT(mp->xflag & COMMIT_PAGE);
2262			hold_metapage(mp, 0);
2263		}
2264
2265		/*
2266		 * extent list:
2267		 * . in-line PXD list:
2268		 * . out-of-line XAD list:
2269		 */
2270		maplock = (struct maplock *) & tlck->lock;
2271		nlock = maplock->index;
2272
2273		for (k = 0; k < nlock; k++, maplock++) {
2274			/*
2275			 * allocate blocks in persistent map:
2276			 *
2277			 * blocks have been allocated from wmap at alloc time;
2278			 */
2279			if (maplock->flag & mlckALLOC) {
2280				txAllocPMap(ipimap, maplock, tblk);
2281			}
2282			/*
2283			 * free blocks in persistent and working map:
2284			 * blocks will be freed in pmap and then in wmap;
2285			 *
2286			 * ? tblock specifies the PMAP/PWMAP based upon
2287			 * transaction
2288			 *
2289			 * free blocks in persistent map:
2290			 * blocks will be freed from wmap at last reference
2291			 * release of the object for regular files;
2292			 *
2293			 * Alway free blocks from both persistent & working
2294			 * maps for directories
2295			 */
2296			else {	/* (maplock->flag & mlckFREE) */
2297
2298				if (S_ISDIR(tlck->ip->i_mode))
2299					txFreeMap(ipimap, maplock,
2300						  tblk, COMMIT_PWMAP);
2301				else
2302					txFreeMap(ipimap, maplock,
2303						  tblk, maptype);
2304			}
2305		}
2306		if (tlck->flag & tlckFREEPAGE) {
2307			if (!(tblk->flag & tblkGC_LAZY)) {
2308				/* This is equivalent to txRelease */
2309				ASSERT(mp->lid == lid);
2310				tlck->mp->lid = 0;
2311			}
2312			assert(atomic_read(&mp->nohomeok) == 1);
2313			atomic_dec(&mp->nohomeok);
2314			discard_metapage(mp);
2315			tlck->mp = 0;
2316		}
2317	}
2318	/*
2319	 *      update inode allocation map
2320	 *
2321	 * update allocation state in pmap and
2322	 * update lsn of the pmap page;
2323	 * update in-memory inode flag/state
2324	 *
2325	 * unlock mapper/write lock
2326	 */
2327	if (tblk->xflag & COMMIT_CREATE) {
2328		ip = tblk->ip;
2329
2330		ASSERT(test_cflag(COMMIT_New, ip));
2331		clear_cflag(COMMIT_New, ip);
2332
2333		diUpdatePMap(ipimap, ip->i_ino, FALSE, tblk);
2334		ipimap->i_state |= I_DIRTY;
2335		/* update persistent block allocation map
2336		 * for the allocation of inode extent;
2337		 */
2338		pxdlock.flag = mlckALLOCPXD;
2339		pxdlock.pxd = JFS_IP(ip)->ixpxd;
2340		pxdlock.index = 1;
2341		txAllocPMap(ip, (struct maplock *) & pxdlock, tblk);
2342		iput(ip);
2343	} else if (tblk->xflag & COMMIT_DELETE) {
2344		ip = tblk->ip;
2345		diUpdatePMap(ipimap, ip->i_ino, TRUE, tblk);
2346		ipimap->i_state |= I_DIRTY;
2347		if (test_and_clear_cflag(COMMIT_Holdlock, ip)) {
2348			if (tblk->flag & tblkGC_LAZY)
2349				IWRITE_UNLOCK(ip);
2350		}
2351		iput(ip);
2352	}
2353}
2354
2355
2356/*
2357 *      txAllocPMap()
2358 *
2359 * function: allocate from persistent map;
2360 *
2361 * parameter:
2362 *      ipbmap  -
2363 *      malock -
2364 *              xad list:
2365 *              pxd:
2366 *
2367 *      maptype -
2368 *              allocate from persistent map;
2369 *              free from persistent map;
2370 *              (e.g., tmp file - free from working map at releae
2371 *               of last reference);
2372 *              free from persistent and working map;
2373 *
2374 *      lsn     - log sequence number;
2375 */
2376static void txAllocPMap(struct inode *ip, struct maplock * maplock,
2377			struct tblock * tblk)
2378{
2379	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
2380	struct xdlistlock *xadlistlock;
2381	xad_t *xad;
2382	s64 xaddr;
2383	int xlen;
2384	struct pxd_lock *pxdlock;
2385	struct xdlistlock *pxdlistlock;
2386	pxd_t *pxd;
2387	int n;
2388
2389	/*
2390	 * allocate from persistent map;
2391	 */
2392	if (maplock->flag & mlckALLOCXADLIST) {
2393		xadlistlock = (struct xdlistlock *) maplock;
2394		xad = xadlistlock->xdlist;
2395		for (n = 0; n < xadlistlock->count; n++, xad++) {
2396			if (xad->flag & (XAD_NEW | XAD_EXTENDED)) {
2397				xaddr = addressXAD(xad);
2398				xlen = lengthXAD(xad);
2399				dbUpdatePMap(ipbmap, FALSE, xaddr,
2400					     (s64) xlen, tblk);
2401				xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
2402				jFYI(1,
2403				     ("allocPMap: xaddr:0x%lx xlen:%d\n",
2404				      (ulong) xaddr, xlen));
2405			}
2406		}
2407	} else if (maplock->flag & mlckALLOCPXD) {
2408		pxdlock = (struct pxd_lock *) maplock;
2409		xaddr = addressPXD(&pxdlock->pxd);
2410		xlen = lengthPXD(&pxdlock->pxd);
2411		dbUpdatePMap(ipbmap, FALSE, xaddr, (s64) xlen, tblk);
2412		jFYI(1,
2413		     ("allocPMap: xaddr:0x%lx xlen:%d\n", (ulong) xaddr,
2414		      xlen));
2415	} else {		/* (maplock->flag & mlckALLOCPXDLIST) */
2416
2417		pxdlistlock = (struct xdlistlock *) maplock;
2418		pxd = pxdlistlock->xdlist;
2419		for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2420			xaddr = addressPXD(pxd);
2421			xlen = lengthPXD(pxd);
2422			dbUpdatePMap(ipbmap, FALSE, xaddr, (s64) xlen,
2423				     tblk);
2424			jFYI(1,
2425			     ("allocPMap: xaddr:0x%lx xlen:%d\n",
2426			      (ulong) xaddr, xlen));
2427		}
2428	}
2429}
2430
2431
2432/*
2433 *      txFreeMap()
2434 *
2435 * function:    free from persistent and/or working map;
2436 *
2437 * todo: optimization
2438 */
2439void txFreeMap(struct inode *ip,
2440	       struct maplock * maplock, struct tblock * tblk, int maptype)
2441{
2442	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
2443	struct xdlistlock *xadlistlock;
2444	xad_t *xad;
2445	s64 xaddr;
2446	int xlen;
2447	struct pxd_lock *pxdlock;
2448	struct xdlistlock *pxdlistlock;
2449	pxd_t *pxd;
2450	int n;
2451
2452	jFYI(1,
2453	     ("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x\n",
2454	      tblk, maplock, maptype));
2455
2456	/*
2457	 * free from persistent map;
2458	 */
2459	if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) {
2460		if (maplock->flag & mlckFREEXADLIST) {
2461			xadlistlock = (struct xdlistlock *) maplock;
2462			xad = xadlistlock->xdlist;
2463			for (n = 0; n < xadlistlock->count; n++, xad++) {
2464				if (!(xad->flag & XAD_NEW)) {
2465					xaddr = addressXAD(xad);
2466					xlen = lengthXAD(xad);
2467					dbUpdatePMap(ipbmap, TRUE, xaddr,
2468						     (s64) xlen, tblk);
2469					jFYI(1,
2470					     ("freePMap: xaddr:0x%lx xlen:%d\n",
2471					      (ulong) xaddr, xlen));
2472				}
2473			}
2474		} else if (maplock->flag & mlckFREEPXD) {
2475			pxdlock = (struct pxd_lock *) maplock;
2476			xaddr = addressPXD(&pxdlock->pxd);
2477			xlen = lengthPXD(&pxdlock->pxd);
2478			dbUpdatePMap(ipbmap, TRUE, xaddr, (s64) xlen,
2479				     tblk);
2480			jFYI(1,
2481			     ("freePMap: xaddr:0x%lx xlen:%d\n",
2482			      (ulong) xaddr, xlen));
2483		} else {	/* (maplock->flag & mlckALLOCPXDLIST) */
2484
2485			pxdlistlock = (struct xdlistlock *) maplock;
2486			pxd = pxdlistlock->xdlist;
2487			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2488				xaddr = addressPXD(pxd);
2489				xlen = lengthPXD(pxd);
2490				dbUpdatePMap(ipbmap, TRUE, xaddr,
2491					     (s64) xlen, tblk);
2492				jFYI(1,
2493				     ("freePMap: xaddr:0x%lx xlen:%d\n",
2494				      (ulong) xaddr, xlen));
2495			}
2496		}
2497	}
2498
2499	/*
2500	 * free from working map;
2501	 */
2502	if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) {
2503		if (maplock->flag & mlckFREEXADLIST) {
2504			xadlistlock = (struct xdlistlock *) maplock;
2505			xad = xadlistlock->xdlist;
2506			for (n = 0; n < xadlistlock->count; n++, xad++) {
2507				xaddr = addressXAD(xad);
2508				xlen = lengthXAD(xad);
2509				dbFree(ip, xaddr, (s64) xlen);
2510				xad->flag = 0;
2511				jFYI(1,
2512				     ("freeWMap: xaddr:0x%lx xlen:%d\n",
2513				      (ulong) xaddr, xlen));
2514			}
2515		} else if (maplock->flag & mlckFREEPXD) {
2516			pxdlock = (struct pxd_lock *) maplock;
2517			xaddr = addressPXD(&pxdlock->pxd);
2518			xlen = lengthPXD(&pxdlock->pxd);
2519			dbFree(ip, xaddr, (s64) xlen);
2520			jFYI(1,
2521			     ("freeWMap: xaddr:0x%lx xlen:%d\n",
2522			      (ulong) xaddr, xlen));
2523		} else {	/* (maplock->flag & mlckFREEPXDLIST) */
2524
2525			pxdlistlock = (struct xdlistlock *) maplock;
2526			pxd = pxdlistlock->xdlist;
2527			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2528				xaddr = addressPXD(pxd);
2529				xlen = lengthPXD(pxd);
2530				dbFree(ip, xaddr, (s64) xlen);
2531				jFYI(1,
2532				     ("freeWMap: xaddr:0x%lx xlen:%d\n",
2533				      (ulong) xaddr, xlen));
2534			}
2535		}
2536	}
2537}
2538
2539
2540/*
2541 *      txFreelock()
2542 *
2543 * function:    remove tlock from inode anonymous locklist
2544 */
2545void txFreelock(struct inode *ip)
2546{
2547	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
2548	struct tlock *xtlck, *tlck;
2549	lid_t xlid = 0, lid;
2550
2551	if (!jfs_ip->atlhead)
2552		return;
2553
2554	xtlck = (struct tlock *) &jfs_ip->atlhead;
2555
2556	while ((lid = xtlck->next)) {
2557		tlck = lid_to_tlock(lid);
2558		if (tlck->flag & tlckFREELOCK) {
2559			xtlck->next = tlck->next;
2560			txLockFree(lid);
2561		} else {
2562			xtlck = tlck;
2563			xlid = lid;
2564		}
2565	}
2566
2567	if (jfs_ip->atlhead)
2568		jfs_ip->atltail = xlid;
2569	else {
2570		jfs_ip->atltail = 0;
2571		/*
2572		 * If inode was on anon_list, remove it
2573		 */
2574		TXN_LOCK();
2575		list_del_init(&jfs_ip->anon_inode_list);
2576		TXN_UNLOCK();
2577	}
2578}
2579
2580
2581/*
2582 *      txAbort()
2583 *
2584 * function: abort tx before commit;
2585 *
2586 * frees line-locks and segment locks for all
2587 * segments in comdata structure.
2588 * Optionally sets state of file-system to FM_DIRTY in super-block.
2589 * log age of page-frames in memory for which caller has
2590 * are reset to 0 (to avoid logwarap).
2591 */
2592void txAbort(tid_t tid, int dirty)
2593{
2594	lid_t lid, next;
2595	struct metapage *mp;
2596	struct tblock *tblk = tid_to_tblock(tid);
2597
2598	jEVENT(1, ("txAbort: tid:%d dirty:0x%x\n", tid, dirty));
2599
2600	/*
2601	 * free tlocks of the transaction
2602	 */
2603	for (lid = tblk->next; lid; lid = next) {
2604		next = lid_to_tlock(lid)->next;
2605
2606		mp = lid_to_tlock(lid)->mp;
2607
2608		if (mp) {
2609			mp->lid = 0;
2610
2611			/*
2612			 * reset lsn of page to avoid logwarap:
2613			 *
2614			 * (page may have been previously committed by another
2615			 * transaction(s) but has not been paged, i.e.,
2616			 * it may be on logsync list even though it has not
2617			 * been logged for the current tx.)
2618			 */
2619			if (mp->xflag & COMMIT_PAGE && mp->lsn)
2620				LogSyncRelease(mp);
2621		}
2622		/* insert tlock at head of freelist */
2623		TXN_LOCK();
2624		txLockFree(lid);
2625		TXN_UNLOCK();
2626	}
2627
2628	/* caller will free the transaction block */
2629
2630	tblk->next = tblk->last = 0;
2631
2632	/*
2633	 * mark filesystem dirty
2634	 */
2635	if (dirty)
2636		updateSuper(tblk->sb, FM_DIRTY);
2637
2638	return;
2639}
2640
2641
2642/*
2643 *      txAbortCommit()
2644 *
2645 * function: abort commit.
2646 *
2647 * frees tlocks of transaction; line-locks and segment locks for all
2648 * segments in comdata structure. frees malloc storage
2649 * sets state of file-system to FM_MDIRTY in super-block.
2650 * log age of page-frames in memory for which caller has
2651 * are reset to 0 (to avoid logwarap).
2652 */
2653void txAbortCommit(struct commit * cd, int exval)
2654{
2655	struct tblock *tblk;
2656	tid_t tid;
2657	lid_t lid, next;
2658	struct metapage *mp;
2659
2660	assert(exval == EIO || exval == ENOMEM);
2661	jEVENT(1, ("txAbortCommit: cd:0x%p\n", cd));
2662
2663	/*
2664	 * free tlocks of the transaction
2665	 */
2666	tid = cd->tid;
2667	tblk = tid_to_tblock(tid);
2668	for (lid = tblk->next; lid; lid = next) {
2669		next = lid_to_tlock(lid)->next;
2670
2671		mp = lid_to_tlock(lid)->mp;
2672		if (mp) {
2673			mp->lid = 0;
2674
2675			/*
2676			 * reset lsn of page to avoid logwarap;
2677			 */
2678			if (mp->xflag & COMMIT_PAGE)
2679				LogSyncRelease(mp);
2680		}
2681
2682		/* insert tlock at head of freelist */
2683		TXN_LOCK();
2684		txLockFree(lid);
2685		TXN_UNLOCK();
2686	}
2687
2688	tblk->next = tblk->last = 0;
2689
2690	/* free the transaction block */
2691	txEnd(tid);
2692
2693	/*
2694	 * mark filesystem dirty
2695	 */
2696	updateSuper(cd->sb, FM_DIRTY);
2697}
2698
2699
2700/*
2701 *      txLazyCommit(void)
2702 *
2703 *	All transactions except those changing ipimap (COMMIT_FORCE) are
2704 *	processed by this routine.  This insures that the inode and block
2705 *	allocation maps are updated in order.  For synchronous transactions,
2706 *	let the user thread finish processing after txUpdateMap() is called.
2707 */
2708void txLazyCommit(struct tblock * tblk)
2709{
2710	struct jfs_log *log;
2711
2712	while (((tblk->flag & tblkGC_READY) == 0) &&
2713	       ((tblk->flag & tblkGC_UNLOCKED) == 0)) {
2714		/* We must have gotten ahead of the user thread
2715		 */
2716		jFYI(1, ("txLazyCommit: tblk 0x%p not unlocked\n", tblk));
2717		schedule();
2718	}
2719
2720	jFYI(1, ("txLazyCommit: processing tblk 0x%p\n", tblk));
2721
2722	txUpdateMap(tblk);
2723
2724	log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
2725
2726	spin_lock_irq(&log->gclock);	// LOGGC_LOCK
2727
2728	tblk->flag |= tblkGC_COMMITTED;
2729
2730	if ((tblk->flag & tblkGC_READY) || (tblk->flag & tblkGC_LAZY))
2731		log->gcrtc--;
2732
2733	if (tblk->flag & tblkGC_READY)
2734		wake_up(&tblk->gcwait);	// LOGGC_WAKEUP
2735
2736	/*
2737	 * Can't release log->gclock until we've tested tblk->flag
2738	 */
2739	if (tblk->flag & tblkGC_LAZY) {
2740		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
2741		txUnlock(tblk);
2742		tblk->flag &= ~tblkGC_LAZY;
2743		txEnd(tblk - TxBlock);	/* Convert back to tid */
2744	} else
2745		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
2746
2747	jFYI(1, ("txLazyCommit: done: tblk = 0x%p\n", tblk));
2748}
2749
2750/*
2751 *      jfs_lazycommit(void)
2752 *
2753 *	To be run as a kernel daemon.  If lbmIODone is called in an interrupt
2754 *	context, or where blocking is not wanted, this routine will process
2755 *	committed transactions from the unlock queue.
2756 */
2757int jfs_lazycommit(void *arg)
2758{
2759	int WorkDone;
2760	struct tblock *tblk;
2761	unsigned long flags;
2762
2763	lock_kernel();
2764
2765	daemonize();
2766	current->tty = NULL;
2767	strcpy(current->comm, "jfsCommit");
2768
2769	unlock_kernel();
2770
2771	jfsCommitTask = current;
2772
2773	spin_lock_irq(&current->sigmask_lock);
2774	sigfillset(&current->blocked);
2775	recalc_sigpending(current);
2776	spin_unlock_irq(&current->sigmask_lock);
2777
2778	LAZY_LOCK_INIT();
2779	TxAnchor.unlock_queue = TxAnchor.unlock_tail = 0;
2780
2781	complete(&jfsIOwait);
2782
2783	do {
2784		DECLARE_WAITQUEUE(wq, current);
2785
2786		LAZY_LOCK(flags);
2787restart:
2788		WorkDone = 0;
2789		while ((tblk = TxAnchor.unlock_queue)) {
2790			/*
2791			 * We can't get ahead of user thread.  Spinning is
2792			 * simpler than blocking/waking.  We shouldn't spin
2793			 * very long, since user thread shouldn't be blocking
2794			 * between lmGroupCommit & txEnd.
2795			 */
2796			WorkDone = 1;
2797
2798			/*
2799			 * Remove first transaction from queue
2800			 */
2801			TxAnchor.unlock_queue = tblk->cqnext;
2802			tblk->cqnext = 0;
2803			if (TxAnchor.unlock_tail == tblk)
2804				TxAnchor.unlock_tail = 0;
2805
2806			LAZY_UNLOCK(flags);
2807			txLazyCommit(tblk);
2808
2809			/*
2810			 * We can be running indefinately if other processors
2811			 * are adding transactions to this list
2812			 */
2813			cond_resched();
2814			LAZY_LOCK(flags);
2815		}
2816
2817		if (WorkDone)
2818			goto restart;
2819
2820		add_wait_queue(&jfs_commit_thread_wait, &wq);
2821		set_current_state(TASK_INTERRUPTIBLE);
2822		LAZY_UNLOCK(flags);
2823		schedule();
2824		current->state = TASK_RUNNING;
2825		remove_wait_queue(&jfs_commit_thread_wait, &wq);
2826	} while (!jfs_stop_threads);
2827
2828	if (TxAnchor.unlock_queue)
2829		jERROR(1, ("jfs_lazycommit being killed with pending transactions!\n"));
2830	else
2831		jFYI(1, ("jfs_lazycommit being killed\n"));
2832	complete(&jfsIOwait);
2833	return 0;
2834}
2835
2836void txLazyUnlock(struct tblock * tblk)
2837{
2838	unsigned long flags;
2839
2840	LAZY_LOCK(flags);
2841
2842	if (TxAnchor.unlock_tail)
2843		TxAnchor.unlock_tail->cqnext = tblk;
2844	else
2845		TxAnchor.unlock_queue = tblk;
2846	TxAnchor.unlock_tail = tblk;
2847	tblk->cqnext = 0;
2848	LAZY_UNLOCK(flags);
2849	wake_up(&jfs_commit_thread_wait);
2850}
2851
2852static void LogSyncRelease(struct metapage * mp)
2853{
2854	struct jfs_log *log = mp->log;
2855
2856	assert(atomic_read(&mp->nohomeok));
2857	assert(log);
2858	atomic_dec(&mp->nohomeok);
2859
2860	if (atomic_read(&mp->nohomeok))
2861		return;
2862
2863	hold_metapage(mp, 0);
2864
2865	LOGSYNC_LOCK(log);
2866	mp->log = NULL;
2867	mp->lsn = 0;
2868	mp->clsn = 0;
2869	log->count--;
2870	list_del_init(&mp->synclist);
2871	LOGSYNC_UNLOCK(log);
2872
2873	release_metapage(mp);
2874}
2875
2876/*
2877 *	txQuiesce
2878 *
2879 *	Block all new transactions and push anonymous transactions to
2880 *	completion
2881 *
2882 *	This does almost the same thing as jfs_sync below.  We don't
2883 *	worry about deadlocking when TlocksLow is set, since we would
2884 *	expect jfs_sync to get us out of that jam.
2885 */
2886void txQuiesce(struct super_block *sb)
2887{
2888	struct inode *ip;
2889	struct jfs_inode_info *jfs_ip;
2890	struct jfs_log *log = JFS_SBI(sb)->log;
2891	int rc;
2892	tid_t tid;
2893
2894	set_bit(log_QUIESCE, &log->flag);
2895
2896	TXN_LOCK();
2897restart:
2898	while (!list_empty(&TxAnchor.anon_list)) {
2899		jfs_ip = list_entry(TxAnchor.anon_list.next,
2900				    struct jfs_inode_info,
2901				    anon_inode_list);
2902		ip = jfs_ip->inode;
2903
2904		/*
2905		 * inode will be removed from anonymous list
2906		 * when it is committed
2907		 */
2908		TXN_UNLOCK();
2909		tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE);
2910		down(&jfs_ip->commit_sem);
2911		rc = txCommit(tid, 1, &ip, 0);
2912		txEnd(tid);
2913		up(&jfs_ip->commit_sem);
2914		/*
2915		 * Just to be safe.  I don't know how
2916		 * long we can run without blocking
2917		 */
2918		cond_resched();
2919		TXN_LOCK();
2920	}
2921
2922	/*
2923	 * If jfs_sync is running in parallel, there could be some inodes
2924	 * on anon_list2.  Let's check.
2925	 */
2926	if (!list_empty(&TxAnchor.anon_list2)) {
2927		list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list);
2928		INIT_LIST_HEAD(&TxAnchor.anon_list2);
2929		goto restart;
2930	}
2931	TXN_UNLOCK();
2932}
2933
2934/*
2935 * txResume()
2936 *
2937 * Allows transactions to start again following txQuiesce
2938 */
2939void txResume(struct super_block *sb)
2940{
2941	struct jfs_log *log = JFS_SBI(sb)->log;
2942
2943	clear_bit(log_QUIESCE, &log->flag);
2944	TXN_WAKEUP(&log->syncwait);
2945}
2946
2947/*
2948 *      jfs_sync(void)
2949 *
2950 *	To be run as a kernel daemon.  This is awakened when tlocks run low.
2951 *	We write any inodes that have anonymous tlocks so they will become
2952 *	available.
2953 */
2954int jfs_sync(void *arg)
2955{
2956	struct inode *ip;
2957	struct jfs_inode_info *jfs_ip;
2958	int rc;
2959	tid_t tid;
2960
2961	lock_kernel();
2962
2963	daemonize();
2964	current->tty = NULL;
2965	strcpy(current->comm, "jfsSync");
2966
2967	unlock_kernel();
2968
2969	spin_lock_irq(&current->sigmask_lock);
2970	sigfillset(&current->blocked);
2971	recalc_sigpending(current);
2972	spin_unlock_irq(&current->sigmask_lock);
2973
2974	complete(&jfsIOwait);
2975
2976	do {
2977		DECLARE_WAITQUEUE(wq, current);
2978		/*
2979		 * write each inode on the anonymous inode list
2980		 */
2981		TXN_LOCK();
2982		while (TlocksLow && !list_empty(&TxAnchor.anon_list)) {
2983			jfs_ip = list_entry(TxAnchor.anon_list.next,
2984					    struct jfs_inode_info,
2985					    anon_inode_list);
2986			ip = jfs_ip->inode;
2987
2988			/*
2989			 * down_trylock returns 0 on success.  This is
2990			 * inconsistent with spin_trylock.
2991			 */
2992			if (! down_trylock(&jfs_ip->commit_sem)) {
2993				/*
2994				 * inode will be removed from anonymous list
2995				 * when it is committed
2996				 */
2997				TXN_UNLOCK();
2998				tid = txBegin(ip->i_sb,
2999					      COMMIT_INODE | COMMIT_FORCE);
3000				rc = txCommit(tid, 1, &ip, 0);
3001				txEnd(tid);
3002				up(&jfs_ip->commit_sem);
3003				/*
3004				 * Just to be safe.  I don't know how
3005				 * long we can run without blocking
3006				 */
3007				cond_resched();
3008				TXN_LOCK();
3009			} else {
3010				/* We can't get the commit semaphore.  It may
3011				 * be held by a thread waiting for tlock's
3012				 * so let's not block here.  Save it to
3013				 * put back on the anon_list.
3014				 */
3015
3016				/* Take off anon_list */
3017				list_del(&jfs_ip->anon_inode_list);
3018
3019				/* Put on anon_list2 */
3020				list_add(&jfs_ip->anon_inode_list,
3021					 &TxAnchor.anon_list2);
3022			}
3023		}
3024		/* Add anon_list2 back to anon_list */
3025		if (!list_empty(&TxAnchor.anon_list2)) {
3026			list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list);
3027			INIT_LIST_HEAD(&TxAnchor.anon_list2);
3028		}
3029		add_wait_queue(&jfs_sync_thread_wait, &wq);
3030		set_current_state(TASK_INTERRUPTIBLE);
3031		TXN_UNLOCK();
3032		schedule();
3033		current->state = TASK_RUNNING;
3034		remove_wait_queue(&jfs_sync_thread_wait, &wq);
3035	} while (!jfs_stop_threads);
3036
3037	jFYI(1, ("jfs_sync being killed\n"));
3038	complete(&jfsIOwait);
3039	return 0;
3040}
3041
3042#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
3043int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
3044		      int *eof, void *data)
3045{
3046	int len = 0;
3047	off_t begin;
3048	char *freewait;
3049	char *freelockwait;
3050	char *lowlockwait;
3051
3052	freewait =
3053	    waitqueue_active(&TxAnchor.freewait) ? "active" : "empty";
3054	freelockwait =
3055	    waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty";
3056	lowlockwait =
3057	    waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
3058
3059	len += sprintf(buffer,
3060		       "JFS TxAnchor\n"
3061		       "============\n"
3062		       "freetid = %d\n"
3063		       "freewait = %s\n"
3064		       "freelock = %d\n"
3065		       "freelockwait = %s\n"
3066		       "lowlockwait = %s\n"
3067		       "tlocksInUse = %d\n"
3068		       "unlock_queue = 0x%p\n"
3069		       "unlock_tail = 0x%p\n",
3070		       TxAnchor.freetid,
3071		       freewait,
3072		       TxAnchor.freelock,
3073		       freelockwait,
3074		       lowlockwait,
3075		       TxAnchor.tlocksInUse,
3076		       TxAnchor.unlock_queue,
3077		       TxAnchor.unlock_tail);
3078
3079	begin = offset;
3080	*start = buffer + begin;
3081	len -= begin;
3082
3083	if (len > length)
3084		len = length;
3085	else
3086		*eof = 1;
3087
3088	if (len < 0)
3089		len = 0;
3090
3091	return len;
3092}
3093#endif
3094